diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5300ce94462d7..7bda7eda68222 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1450,9 +1450,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) return true; - if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy)) - return true; - auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { // Create a diagnostic for a the register string literal. const MemoryBuffer &Buffer = diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 10cdad30913d4..1325afb68ca9a 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -909,13 +909,6 @@ enum Offset_COV5 : unsigned { }; } // namespace ImplicitArg - -namespace VirtRegFlag { -// Virtual Register Flags. -enum Register_Flag : uint8_t { WWM_REG = 0 }; - -} // namespace VirtRegFlag - } // namespace AMDGPU #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 1d7d0dfd9a949..c2bc95930272c 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -66,8 +66,7 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, static void getVGPRSpillLaneOrTempRegister( MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, - const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, - bool IncludeScratchCopy = true) { + const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) { SIMachineFunctionInfo *MFI = MF.getInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); @@ -78,12 +77,9 @@ static void getVGPRSpillLaneOrTempRegister( // We need to save and restore the given SGPR. 
- Register ScratchSGPR; // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs - // should have all the callee saved registers marked as used. For certain - // cases we skip copy to scratch SGPR. - if (IncludeScratchCopy) - ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); + // should have all the callee saved registers marked as used. + Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); if (!ScratchSGPR) { int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, @@ -1354,8 +1350,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { - RS->enterBasicBlockEnd(MBB); - RS->backward(MI); + // FIXME: change to enterBasicBlockEnd() + RS->enterBasicBlock(MBB); TRI->eliminateFrameIndex(MI, 0, FIOp, RS); SpillFIs.set(FI); continue; @@ -1452,10 +1448,8 @@ void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced( // The special SGPR spills like the one needed for FP, BP or any reserved // registers delayed until frame lowering. void SIFrameLowering::determinePrologEpilogSGPRSaves( - MachineFunction &MF, BitVector &SavedVGPRs, - bool NeedExecCopyReservedReg) const { + MachineFunction &MF, BitVector &SavedVGPRs) const { MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *MFI = MF.getInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1467,27 +1461,6 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( for (unsigned I = 0; CSRegs[I]; ++I) LiveRegs.addReg(CSRegs[I]); - if (NeedExecCopyReservedReg) { - Register ReservedReg = MFI->getSGPRForEXECCopy(); - assert(ReservedReg && "Should have reserved an SGPR for EXEC copy."); - const TargetRegisterClass &RC = ST.isWave32() - ? 
AMDGPU::SReg_32_XM0_XEXECRegClass - : AMDGPU::SGPR_64RegClass; - Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC); - if (UnusedScratchReg) { - // If found any unused scratch SGPR, reserve the register itself for Exec - // copy and there is no need for any spills in that case. - MFI->setSGPRForEXECCopy(UnusedScratchReg); - LiveRegs.addReg(UnusedScratchReg); - } else { - // Needs spill. - assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) && - "Re-reserving spill slot for EXEC copy register"); - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC, - /* IncludeScratchCopy */ false); - } - } - // hasFP only knows about stack objects that already exist. We're now // determining the stack slots that will be created, so we have to predict // them. Stack objects force FP usage with calls. @@ -1526,8 +1499,6 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - bool NeedExecCopyReservedReg = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -1545,8 +1516,6 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg()); else if (MI.getOpcode() == AMDGPU::V_READLANE_B32) MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg()); - else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) - NeedExecCopyReservedReg = true; } } @@ -1559,7 +1528,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, if (!ST.hasGFX90AInsts()) SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); - determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg); + determinePrologEpilogSGPRSaves(MF, SavedVGPRs); // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't // allow the default insertion to handle them. 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 0060fc0be431b..def07dc4b1f7e 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -34,8 +34,8 @@ class SIFrameLowering final : public AMDGPUFrameLowering { RegScavenger *RS = nullptr) const override; void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const; - void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs, - bool NeedExecCopyReservedReg) const; + void determinePrologEpilogSGPRSaves(MachineFunction &MF, + BitVector &SavedRegs) const; void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, Register FrameReg, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 76fd98a174b64..8a627fb79a0ed 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12589,14 +12589,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { } } - // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling. - unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); - Register SReg = - ST.isWave32() - ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1) - : AMDGPU::SGPR_64RegClass.getRegister((MaxNumSGPRs / 2) - 1); - Info->setSGPRForEXECCopy(SReg); - TargetLoweringBase::finalizeLowering(MF); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index b29fa1ae77184..492f06c97a860 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1567,28 +1567,6 @@ static unsigned getAVSpillSaveOpcode(unsigned Size) { } } -static unsigned getWWMRegSpillSaveOpcode(unsigned Size) { - // Currently, there is only 32-bit WWM register spills needed. 
- if (Size != 4) - llvm_unreachable("unknown wwm register spill size"); - - return AMDGPU::SI_SPILL_WWM_V32_SAVE; -} - -static unsigned getVectorRegSpillSaveOpcode(Register Reg, - const TargetRegisterClass *RC, - unsigned Size, - const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - // Choose the right opcode if spilling a WWM register. - if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) - return getWWMRegSpillSaveOpcode(Size); - - return TRI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(Size) - : TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) - : getVGPRSpillSaveOpcode(Size); -} - void SIInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, @@ -1633,8 +1611,11 @@ void SIInstrInfo::storeRegToStackSlot( return; } - unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, - SpillSize, RI, *MFI); + unsigned Opcode = RI.isVectorSuperClass(RC) + ? getAVSpillSaveOpcode(SpillSize) + : RI.isAGPRClass(RC) + ? getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) @@ -1785,27 +1766,6 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) { } } -static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) { - // Currently, there is only 32-bit WWM register spills needed. - if (Size != 4) - llvm_unreachable("unknown wwm register spill size"); - - return AMDGPU::SI_SPILL_WWM_V32_RESTORE; -} - -static unsigned -getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, - unsigned Size, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - // Choose the right opcode if restoring a WWM register. - if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) - return getWWMRegSpillRestoreOpcode(Size); - - return TRI.isVectorSuperClass(RC) ? getAVSpillRestoreOpcode(Size) - : TRI.isAGPRClass(RC) ? 
getAGPRSpillRestoreOpcode(Size) - : getVGPRSpillRestoreOpcode(Size); -} - void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, @@ -1849,9 +1809,11 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC, - SpillSize, RI, *MFI); - + unsigned Opcode = RI.isVectorSuperClass(RC) + ? getAVSpillRestoreOpcode(SpillSize) + : RI.isAGPRClass(RC) + ? getAGPRSpillRestoreOpcode(SpillSize) + : getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index bc0f44d6ac0fa..4782b3a7bc202 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -627,11 +627,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill; } - static bool isWWMRegSpillOpcode(uint16_t Opcode) { - return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE || - Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE; - } - static bool isDPP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::DPP; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 20722f97323f0..24384aeea21ff 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -890,8 +890,6 @@ defm SI_SPILL_AV384 : SI_SPILL_VGPR ; defm SI_SPILL_AV512 : SI_SPILL_VGPR ; defm SI_SPILL_AV1024 : SI_SPILL_VGPR ; -defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR ; - def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index bb81c2556bfd4..3450a9f0681f9 100644 --- 
a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -20,7 +20,6 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" @@ -39,7 +38,6 @@ class SILowerSGPRSpills : public MachineFunctionPass { const SIInstrInfo *TII = nullptr; LiveIntervals *LIS = nullptr; SlotIndexes *Indexes = nullptr; - MachineDominatorTree *MDT = nullptr; // Save and Restore blocks of the current function. Typically there is a // single save block, unless Windows EH funclets are involved. @@ -53,23 +51,13 @@ class SILowerSGPRSpills : public MachineFunctionPass { void calculateSaveRestoreBlocks(MachineFunction &MF); bool spillCalleeSavedRegs(MachineFunction &MF); - void updateLaneVGPRDomInstr( - int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, - DenseMap &LaneVGPRDomInstr); bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } - - MachineFunctionProperties getClearedProperties() const override { - return MachineFunctionProperties() - .set(MachineFunctionProperties::Property::IsSSA) - .set(MachineFunctionProperties::Property::NoVRegs); - } }; } // end anonymous namespace @@ -80,7 +68,6 @@ INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) @@ -261,55 +248,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { return false; } -void 
SILowerSGPRSpills::updateLaneVGPRDomInstr( - int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, - DenseMap &LaneVGPRDomInstr) { - // For the Def of a virtual LaneVPGR to dominate all its uses, we should - // insert an IMPLICIT_DEF before the dominating spill. Switching to a - // depth first order doesn't really help since the machine function can be in - // the unstructured control flow post-SSA. For each virtual register, hence - // finding the common dominator to get either the dominating spill or a block - // dominating all spills. Is there a better way to handle it? - SIMachineFunctionInfo *FuncInfo = - MBB->getParent()->getInfo(); - ArrayRef VGPRSpills = - FuncInfo->getSGPRSpillToVGPRLanes(FI); - Register PrevLaneVGPR; - for (auto &Spill : VGPRSpills) { - if (PrevLaneVGPR == Spill.VGPR) - continue; - - PrevLaneVGPR = Spill.VGPR; - auto I = LaneVGPRDomInstr.find(Spill.VGPR); - if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) { - // Initially add the spill instruction itself for Insertion point. - LaneVGPRDomInstr[Spill.VGPR] = InsertPt; - } else { - assert(I != LaneVGPRDomInstr.end()); - auto PrevInsertPt = I->second; - MachineBasicBlock *DomMBB = PrevInsertPt->getParent(); - if (DomMBB == MBB) { - // The insertion point earlier selected in a predecessor block whose - // spills are currently being lowered. The earlier InsertPt would be - // the one just before the block terminator and it should be changed - // if we insert any new spill in it. - if (MDT->dominates(&*InsertPt, &*PrevInsertPt)) - I->second = InsertPt; - - continue; - } - - // Find the common dominator block between PrevInsertPt and the - // current spill. 
- DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB); - if (DomMBB == MBB) - I->second = InsertPt; - else if (DomMBB != PrevInsertPt->getParent()) - I->second = &(*DomMBB->getFirstTerminator()); - } - } -} - bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -317,7 +255,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { LIS = getAnalysisIfAvailable(); Indexes = getAnalysisIfAvailable(); - MDT = &getAnalysis(); assert(SaveBlocks.empty() && RestoreBlocks.empty()); @@ -327,6 +264,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { bool HasCSRs = spillCalleeSavedRegs(MF); MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); if (!MFI.hasStackObjects() && !HasCSRs) { @@ -336,6 +274,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { } bool MadeChange = false; + bool NewReservedRegs = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. @@ -351,9 +290,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); - // To track the IMPLICIT_DEF insertion point for the lane vgprs. 
- DenseMap LaneVGPRDomInstr; - for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { if (!TII->isSGPRSpill(MI)) @@ -361,32 +297,23 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - MachineInstrSpan MIS(&MI, &MBB); if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { + NewReservedRegs = true; bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( MI, FI, nullptr, Indexes, LIS); (void)Spilled; assert(Spilled && "failed to spill SGPR to VGPR when allocated"); SpillFIs.set(FI); - updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr); } } } - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) { - auto InsertPt = LaneVGPRDomInstr[Reg]; - // Insert the IMPLICIT_DEF at the identified points. - auto MIB = - BuildMI(*InsertPt->getParent(), *InsertPt, InsertPt->getDebugLoc(), - TII->get(AMDGPU::IMPLICIT_DEF), Reg); - FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); - if (LIS) { - LIS->InsertMachineInstrInMaps(*MIB); - LIS->createAndComputeVirtRegInterval(Reg); - } - } - + // FIXME: Adding to live-ins redundant with reserving registers. for (MachineBasicBlock &MBB : MF) { + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) + MBB.addLiveIn(Reg); + MBB.sortUniqueLiveIns(); + // FIXME: The dead frame indices are replaced with a null register from // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is @@ -407,26 +334,15 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // lane". FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetRegisterClass *RC = - ST.isWave32() ? &AMDGPU::SGPR_32RegClass : &AMDGPU::SGPR_64RegClass; - // Shift back the reserved SGPR for EXEC copy into the lowest range. 
- // This SGPR is reserved to handle the whole-wave spill/copy operations - // that might get inserted during vgpr regalloc. - Register UnusedLowSGPR = TRI->findUnusedRegister(MRI, RC, MF); - if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) < - TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy())) - FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); - MadeChange = true; - } else { - // No SGPR spills and hence there won't be any WWM spills/copies. Reset the - // SGPR reserved for EXEC copy. - FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); } SaveBlocks.clear(); RestoreBlocks.clear(); + // Updated the reserved registers with any VGPRs added for SGPR spills. + if (NewReservedRegs) + MRI.freezeReservedRegs(MF); + return MadeChange; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index ff6c4e0304b8c..6eea030afb00b 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -60,9 +60,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) Occupancy = ST.computeOccupancy(F, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); - const_cast(MF).getRegInfo().addDelegate(this); - VRegFlags.reserve(256); - // FIXME: Should have analysis or something rather than attribute to detect // calls. 
const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); @@ -310,11 +307,24 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, unsigned LaneIndex) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (LaneVGPR == AMDGPU::NoRegister) { + // We have no VGPRs left for spilling SGPRs. Reset because we will not + // partially spill the SGPR to VGPRs. + SGPRSpillToVGPRLanes.erase(FI); + return false; + } + SpillVGPRs.push_back(LaneVGPR); + // Add this register as live-in to all blocks to avoid machine verifier + // complaining about use of an undefined physical register. + for (MachineBasicBlock &BB : MF) + BB.addLiveIn(LaneVGPR); } else { LaneVGPR = SpillVGPRs.back(); } @@ -522,16 +532,6 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const { return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; } -void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) { - VRegFlags.grow(Reg); -} - -void SIMachineFunctionInfo::MRI_NotecloneVirtualRegister(Register NewReg, - Register SrcReg) { - VRegFlags.grow(NewReg); - VRegFlags[NewReg] = VRegFlags[SrcReg]; -} - Register SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const { const GCNSubtarget &ST = MF.getSubtarget(); @@ -639,10 +639,6 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( if (MFI.getVGPRForAGPRCopy()) VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI); - - if (MFI.getSGPRForEXECCopy()) - SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI); - auto SFI = MFI.getOptionalScavengeFI(); if (SFI) ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo()); diff --git 
a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 32e79116322b2..c0cfc36e0a962 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -275,7 +275,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { SIMode Mode; std::optional ScavengeFI; StringValue VGPRForAGPRCopy; - StringValue SGPRForEXECCopy; SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, @@ -317,8 +316,6 @@ template <> struct MappingTraits { YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI); YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy, StringValue()); // Don't print out when it's empty. - YamlIO.mapOptional("sgprForEXECCopy", MFI.SGPRForEXECCopy, - StringValue()); // Don't print out when it's empty. } }; @@ -355,8 +352,7 @@ class PrologEpilogSGPRSaveRestoreInfo { /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. -class SIMachineFunctionInfo final : public AMDGPUMachineFunction, - private MachineRegisterInfo::Delegate { +class SIMachineFunctionInfo final : public AMDGPUMachineFunction { friend class GCNTargetMachine; // State of MODE register, assumed FP mode. @@ -454,9 +450,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, unsigned HighBitsOf32BitAddress; - // Flags associated with the virtual registers. - IndexedMap VRegFlags; - // Current recorded maximum possible occupancy. unsigned Occupancy; @@ -466,10 +459,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, MCPhysReg getNextSystemSGPR() const; - // MachineRegisterInfo callback functions to notify events. 
- void MRI_NoteNewVirtualRegister(Register Reg) override; - void MRI_NotecloneVirtualRegister(Register NewReg, Register SrcReg) override; - public: struct VGPRSpillToAGPR { SmallVector Lanes; @@ -478,11 +467,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, }; private: - // To track virtual VGPR + lane index for each subregister of the SGPR spilled - // to frameindex key during SILowerSGPRSpills pass. + // To track VGPR + lane index for each subregister of the SGPR spilled to + // frameindex key during SILowerSGPRSpills pass. DenseMap> SGPRSpillToVGPRLanes; - // To track physical VGPR + lane index for spilling special SGPRs like Frame - // Pointer identified during PrologEpilogInserter. + // To track VGPR + lane index for spilling special SGPRs like Frame Pointer + // identified during PrologEpilogInserter. DenseMap> PrologEpilogSGPRSpillToVGPRLanes; unsigned NumVGPRSpillLanes = 0; @@ -512,9 +501,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, // PrologEpilogInserter. PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills; - // To save/restore EXEC MASK around WWM spills and copies. - Register SGPRForEXECCopy; - DenseMap VGPRToAGPRSpills; // AGPRs used for VGPR spills. 
@@ -638,19 +624,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, : makeArrayRef(I->second); } - void setFlag(Register Reg, uint8_t Flag) { - assert(Reg.isVirtual()); - if (VRegFlags.inBounds(Reg)) - VRegFlags[Reg] |= (uint8_t)1 << Flag; - } - - bool checkFlag(Register Reg, uint8_t Flag) const { - if (Reg.isPhysical()) - return false; - - return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & ((uint8_t)1 << Flag); - } - void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4, Align Alignment = Align(4)); @@ -663,10 +636,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, return SpillAGPR; } - Register getSGPRForEXECCopy() const { return SGPRForEXECCopy; } - - void setSGPRForEXECCopy(Register Reg) { SGPRForEXECCopy = Reg; } - ArrayRef getVGPRSpillAGPRs() const { return SpillVGPR; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 9c524d7cb2e8d..c5ef7bf7dd001 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -646,11 +646,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); } - // SGPR used to preserve EXEC MASK around WWM spill/copy instructions. - Register ExecCopyReg = MFI->getSGPRForEXECCopy(); - if (ExecCopyReg) - reserveRegisterTuples(Reserved, ExecCopyReg); - // Reserve VGPRs/AGPRs. 
// unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); @@ -716,6 +711,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) reserveRegisterTuples(Reserved, Reg); + for (auto Reg : MFI->getSGPRSpillVGPRs()) + reserveRegisterTuples(Reserved, Reg); + return Reserved; } @@ -1067,8 +1065,6 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_A32_RESTORE: case AMDGPU::SI_SPILL_AV32_SAVE: case AMDGPU::SI_SPILL_AV32_RESTORE: - case AMDGPU::SI_SPILL_WWM_V32_SAVE: - case AMDGPU::SI_SPILL_WWM_V32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -2009,40 +2005,6 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( } } -static void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, Register Reg, - RegScavenger *RS) { - const GCNSubtarget &ST = MF.getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - bool IsWave32 = ST.isWave32(); - if (RS->isRegUsed(AMDGPU::SCC)) { - // Insert two move instructions, one to save the original value of EXEC and - // the other to turn on all bits in EXEC. This is required as we can't use - // the single instruction S_OR_SAVEEXEC that clobbers SCC. - unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill); - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); - } else { - const unsigned OrSaveExec = - IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; - auto SaveExec = - BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); - SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. 
- } -} - -static void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - Register Reg) { - const GCNSubtarget &ST = MF.getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec).addReg(Reg, RegState::Kill); -} - bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -2141,8 +2103,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_AV128_SAVE: case AMDGPU::SI_SPILL_AV96_SAVE: case AMDGPU::SI_SPILL_AV64_SAVE: - case AMDGPU::SI_SPILL_AV32_SAVE: - case AMDGPU::SI_SPILL_WWM_V32_SAVE: { + case AMDGPU::SI_SPILL_AV32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2151,18 +2112,11 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR : AMDGPU::BUFFER_STORE_DWORD_OFFSET; auto *MBB = MI->getParent(); - bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); - if (IsWWMRegSpill) - insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), RS); - buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); - if (IsWWMRegSpill) - restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); - MI->eraseFromParent(); return true; } @@ -2207,8 +2161,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_AV352_RESTORE: case AMDGPU::SI_SPILL_AV384_RESTORE: case AMDGPU::SI_SPILL_AV512_RESTORE: - case AMDGPU::SI_SPILL_AV1024_RESTORE: - case AMDGPU::SI_SPILL_WWM_V32_RESTORE: { + case AMDGPU::SI_SPILL_AV1024_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2217,17 +2170,10 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; auto *MBB = MI->getParent(); - bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); - if (IsWWMRegSpill) - insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), RS); - buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); - if (IsWWMRegSpill) - restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); - MI->eraseFromParent(); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll index 2b3d710fb9f2a..8ae7f0520392d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -13,7 +13,6 @@ define ptr addrspace(1) @call_assert_align() { ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: ; implicit-def: $vgpr40 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index c4169300c4f13..28a7b1a62a708 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -243,7 +243,6 @@ define void @func_caller_stack() { ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 -; MUBUF-NEXT: ; implicit-def: $vgpr40 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; MUBUF-NEXT: v_mov_b32_e32 v0, 11 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 @@ -281,7 +280,6 @@ define void 
@func_caller_stack() { ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 10 -; FLATSCR-NEXT: ; implicit-def: $vgpr40 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 11 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 @@ -322,9 +320,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: ; implicit-def: $vgpr40 -; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 @@ -406,9 +403,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: ; implicit-def: $vgpr40 -; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index 157aebb25033d..3a985d1b2deef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -8,83 +8,75 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: v_mov_b32_e32 v14, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v2 -; CHECK-NEXT: v_mov_b32_e32 v12, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v4 -; CHECK-NEXT: v_mov_b32_e32 v10, v5 -; CHECK-NEXT: v_mov_b32_e32 v9, v6 -; CHECK-NEXT: v_mov_b32_e32 v8, v7 +; CHECK-NEXT: v_mov_b32_e32 v15, v1 +; CHECK-NEXT: v_mov_b32_e32 v14, v2 +; CHECK-NEXT: v_mov_b32_e32 v13, v3 +; CHECK-NEXT: v_mov_b32_e32 v12, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v5 +; CHECK-NEXT: v_mov_b32_e32 v10, v6 +; CHECK-NEXT: v_mov_b32_e32 v9, v7 ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; CHECK-NEXT: v_mov_b32_e32 v1, v14 -; CHECK-NEXT: v_mov_b32_e32 v2, v13 -; CHECK-NEXT: v_mov_b32_e32 v3, v12 -; CHECK-NEXT: v_mov_b32_e32 v4, v11 -; CHECK-NEXT: v_mov_b32_e32 v5, v10 -; CHECK-NEXT: v_mov_b32_e32 v6, v9 -; CHECK-NEXT: v_mov_b32_e32 v7, v8 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_mov_b32_e32 v2, v14 +; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v4, v12 +; CHECK-NEXT: v_mov_b32_e32 
v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v6, v10 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s4, s8 ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: v_writelane_b32 v0, s6, 2 -; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: v_writelane_b32 v8, s4, 0 +; CHECK-NEXT: v_writelane_b32 v8, s5, 1 +; CHECK-NEXT: v_writelane_b32 v8, s6, 2 +; CHECK-NEXT: v_writelane_b32 v8, s7, 3 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_mov_b32_e32 v2, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v0, s4, 4 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b32 
exec_lo, s21 +; CHECK-NEXT: v_writelane_b32 v8, s4, 4 ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v8, v9 -; CHECK-NEXT: v_mov_b32_e32 v7, v10 -; CHECK-NEXT: v_mov_b32_e32 v6, v11 -; CHECK-NEXT: v_mov_b32_e32 v5, v12 -; CHECK-NEXT: v_mov_b32_e32 v4, v13 -; CHECK-NEXT: v_mov_b32_e32 v3, v14 -; CHECK-NEXT: v_mov_b32_e32 v2, v15 -; CHECK-NEXT: v_mov_b32_e32 v1, 
v16 -; CHECK-NEXT: v_readfirstlane_b32 s12, v8 -; CHECK-NEXT: v_readfirstlane_b32 s10, v7 -; CHECK-NEXT: v_readfirstlane_b32 s9, v6 -; CHECK-NEXT: v_readfirstlane_b32 s8, v5 -; CHECK-NEXT: v_readfirstlane_b32 s7, v4 -; CHECK-NEXT: v_readfirstlane_b32 s6, v3 -; CHECK-NEXT: v_readfirstlane_b32 s5, v2 -; CHECK-NEXT: v_readfirstlane_b32 s4, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: v_mov_b32_e32 v6, v10 +; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v4, v12 +; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v2, v14 +; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_mov_b32_e32 v0, v16 +; CHECK-NEXT: v_readfirstlane_b32 s12, v7 +; CHECK-NEXT: v_readfirstlane_b32 s10, v6 +; CHECK-NEXT: v_readfirstlane_b32 s9, v5 +; CHECK-NEXT: v_readfirstlane_b32 s8, v4 +; CHECK-NEXT: v_readfirstlane_b32 s7, v3 +; CHECK-NEXT: v_readfirstlane_b32 s6, v2 +; CHECK-NEXT: v_readfirstlane_b32 s5, v1 +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 ; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: s_mov_b32 s13, s10 ; CHECK-NEXT: s_mov_b32 s14, s9 @@ -93,79 +85,68 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v0, s12, 5 -; CHECK-NEXT: v_writelane_b32 v0, s13, 6 -; CHECK-NEXT: v_writelane_b32 v0, s14, 7 -; CHECK-NEXT: v_writelane_b32 v0, s15, 8 -; CHECK-NEXT: v_writelane_b32 v0, s16, 9 -; CHECK-NEXT: v_writelane_b32 v0, s17, 10 -; CHECK-NEXT: v_writelane_b32 v0, s18, 11 -; CHECK-NEXT: v_writelane_b32 v0, s19, 12 -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: v_mov_b32_e32 v8, v10 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v6, v12 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v4, v14 -; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v2, v16 +; CHECK-NEXT: v_writelane_b32 v8, s12, 5 
+; CHECK-NEXT: v_writelane_b32 v8, s13, 6 +; CHECK-NEXT: v_writelane_b32 v8, s14, 7 +; CHECK-NEXT: v_writelane_b32 v8, s15, 8 +; CHECK-NEXT: v_writelane_b32 v8, s16, 9 +; CHECK-NEXT: v_writelane_b32 v8, s17, 10 +; CHECK-NEXT: v_writelane_b32 v8, s18, 11 +; CHECK-NEXT: v_writelane_b32 v8, s19, 12 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v7, v10 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v5, v12 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 v3, v14 +; CHECK-NEXT: v_mov_b32_e32 v0, v15 +; CHECK-NEXT: v_mov_b32_e32 v1, v16 ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] ; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15] ; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17] ; CHECK-NEXT: s_mov_b64 s[6:7], s[18:19] -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[7:8] -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[5:6] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[6:7] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[4:5] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[3:4] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v0, s4, 13 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: v_writelane_b32 v8, s4, 13 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 
4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s4, v8, 13 +; CHECK-NEXT: v_readlane_b32 s8, v8, 5 +; CHECK-NEXT: v_readlane_b32 s9, v8, 6 +; CHECK-NEXT: v_readlane_b32 s10, v8, 7 +; CHECK-NEXT: v_readlane_b32 s11, v8, 8 +; CHECK-NEXT: v_readlane_b32 s12, v8, 9 +; CHECK-NEXT: v_readlane_b32 s13, v8, 10 +; CHECK-NEXT: v_readlane_b32 s14, v8, 11 +; CHECK-NEXT: v_readlane_b32 s15, v8, 12 +; CHECK-NEXT: v_readlane_b32 s16, v8, 0 +; CHECK-NEXT: v_readlane_b32 s17, v8, 1 +; CHECK-NEXT: v_readlane_b32 s18, v8, 2 +; CHECK-NEXT: v_readlane_b32 s19, v8, 3 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v2, 13 -; CHECK-NEXT: v_readlane_b32 s8, v2, 5 -; CHECK-NEXT: v_readlane_b32 s9, v2, 6 -; CHECK-NEXT: v_readlane_b32 s10, v2, 7 -; CHECK-NEXT: v_readlane_b32 s11, v2, 8 -; CHECK-NEXT: v_readlane_b32 s12, v2, 9 -; CHECK-NEXT: v_readlane_b32 s13, v2, 10 -; CHECK-NEXT: v_readlane_b32 s14, v2, 11 -; CHECK-NEXT: v_readlane_b32 s15, v2, 12 -; CHECK-NEXT: v_readlane_b32 s16, v2, 0 -; CHECK-NEXT: v_readlane_b32 s17, v2, 1 -; CHECK-NEXT: v_readlane_b32 s18, v2, 2 -; CHECK-NEXT: v_readlane_b32 s19, v2, 3 ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v0, 4 +; CHECK-NEXT: v_readlane_b32 s4, v8, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 66ee43a7cee24..c7e5931c110a0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -242,9 +242,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: v_writelane_b32 v41, s16, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s16, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index f234847900e76..313baf1c32829 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -23,7 +23,6 @@ define void @parent_func_missing_inputs() #0 { ; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FIXEDABI-NEXT: 
s_mov_b64 exec, s[18:19] -; FIXEDABI-NEXT: ; implicit-def: $vgpr40 ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 ; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0 ; FIXEDABI-NEXT: v_writelane_b32 v41, s16, 0 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index d5caefda01d6b..3f711da775039 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -1417,13 +1417,12 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v2, s30, 0 ; GCN-NEXT: v_writelane_b32 v2, s31, 1 @@ -1442,14 +1441,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1459,7 +1458,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr2 ; GFX7-NEXT: 
v_writelane_b32 v2, s30, 0 ; GFX7-NEXT: v_writelane_b32 v2, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1473,14 +1471,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1490,7 +1488,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1504,14 +1501,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1521,7 +1518,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_add_u32 s4, s4, 
test_arg_store@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1534,7 +1530,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1542,7 +1538,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1552,7 +1548,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 @@ -1567,7 +1562,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1580,13 +1575,12 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v2bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v3, s30, 0 ; GCN-NEXT: v_writelane_b32 v3, s31, 1 @@ -1609,14 +1603,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v2bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1626,7 +1620,6 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr3 ; GFX7-NEXT: v_writelane_b32 v3, s30, 0 ; GFX7-NEXT: v_writelane_b32 v3, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1644,14 +1637,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
test_call_v2bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1661,7 +1654,6 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1674,14 +1666,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1691,7 +1683,6 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1704,7 +1695,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1712,7 +1703,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1722,7 +1713,6 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 @@ -1737,7 +1727,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1750,13 +1740,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: 
$vgpr4 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v4, s30, 0 ; GCN-NEXT: v_writelane_b32 v4, s31, 1 @@ -1780,14 +1769,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v3bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1797,7 +1786,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1816,14 +1804,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v3bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1833,9 +1821,8 @@ define void 
@test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_writelane_b32 v3, s30, 0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_writelane_b32 v3, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1850,14 +1837,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v3bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1870,9 +1857,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1886,7 +1872,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] 
; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1894,7 +1880,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1906,9 +1892,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: ; implicit-def: $vgpr3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v4 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1924,7 +1909,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1937,13 +1922,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v4bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; 
implicit-def: $vgpr5 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v5, s30, 0 ; GCN-NEXT: v_writelane_b32 v5, s31, 1 @@ -1974,14 +1958,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v4bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1991,7 +1975,6 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr5 ; GFX7-NEXT: v_writelane_b32 v5, s30, 0 ; GFX7-NEXT: v_writelane_b32 v5, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2017,14 +2000,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v4bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2034,7 +2017,6 @@ 
define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr3 ; GFX8-NEXT: v_writelane_b32 v3, s30, 0 ; GFX8-NEXT: v_writelane_b32 v3, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2058,14 +2040,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v4bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2075,7 +2057,6 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2094,7 +2075,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2102,7 +2083,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr 
addrspace(5) %out) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2112,7 +2093,6 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr3 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 @@ -2133,7 +2113,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2146,13 +2126,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v8bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr9 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v9, s30, 0 ; GCN-NEXT: v_writelane_b32 v9, s31, 1 @@ -2199,14 +2178,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: 
s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v8bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2216,7 +2195,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr9 ; GFX7-NEXT: v_writelane_b32 v9, s30, 0 ; GFX7-NEXT: v_writelane_b32 v9, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2258,14 +2236,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v8bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2275,7 +2253,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr5 ; GFX8-NEXT: v_writelane_b32 v5, s30, 0 ; GFX8-NEXT: 
v_writelane_b32 v5, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2313,14 +2290,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v8bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2330,7 +2307,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: v_writelane_b32 v5, s30, 0 ; GFX9-NEXT: v_writelane_b32 v5, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2357,7 +2333,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2365,7 +2341,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded 
Spill @@ -2375,7 +2351,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr5 ; GFX10-NEXT: v_writelane_b32 v5, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v5, s31, 1 @@ -2404,7 +2379,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2417,13 +2392,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v16bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr17 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v17, s30, 0 ; GCN-NEXT: v_writelane_b32 v17, s31, 1 @@ -2502,14 +2476,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v16bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; 
GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2519,7 +2493,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: ; implicit-def: $vgpr17 ; GFX7-NEXT: v_writelane_b32 v17, s30, 0 ; GFX7-NEXT: v_writelane_b32 v17, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2593,14 +2566,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v16bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2610,7 +2583,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr9 ; GFX8-NEXT: v_writelane_b32 v9, s30, 0 ; GFX8-NEXT: v_writelane_b32 v9, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2676,14 +2648,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 
s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v16bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2693,7 +2665,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: v_writelane_b32 v9, s30, 0 ; GFX9-NEXT: v_writelane_b32 v9, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2736,7 +2707,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2744,7 +2715,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2754,7 +2725,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr9 ; 
GFX10-NEXT: v_writelane_b32 v9, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v9, s31, 1 @@ -2799,7 +2769,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index f0b0afa02b1ac..2635edcb9d8a3 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -902,11 +902,6 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: ; implicit-def: $vgpr1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt expcnt(1) ; CHECK-NEXT: v_writelane_b32 v0, s30, 0 ; CHECK-NEXT: v_writelane_b32 v0, s31, 1 @@ -983,6 +978,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v1, s101, 6 ; CHECK-NEXT: s_cmp_eq_u32 s31, 0 ; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll index 6c78e9a4e9b7f..4d55d4974be74 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -8,12 +8,12 @@ @alias = hidden alias void (), ptr @aliasee_default ; 
ALL-LABEL: {{^}}kernel: -; GFX908: .amdhsa_next_free_vgpr 32 +; GFX908: .amdhsa_next_free_vgpr 41 ; GFX908-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A: .amdhsa_next_free_vgpr 59 +; GFX90A: .amdhsa_next_free_vgpr 71 ; GFX90A-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A-NEXT: .amdhsa_accum_offset 32 +; GFX90A-NEXT: .amdhsa_accum_offset 44 define amdgpu_kernel void @kernel() #0 { bb: call void @alias() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll index 59eae79ca1224..5672cbb3f94f5 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -9,7 +9,7 @@ ; The parent kernel has a higher VGPR usage than the possible callees. ; CHECK-LABEL: {{^}}kernel1: -; CHECK: .amdhsa_next_free_vgpr 41 +; CHECK: .amdhsa_next_free_vgpr 42 ; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel1() #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 9451a062af6a3..cde7716ab7c18 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -178,7 +178,7 @@ declare hidden void @external_void_func_void() #0 ; restored. No FP is required. 
; ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: -; GCN: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] @@ -189,7 +189,7 @@ declare hidden void @external_void_func_void() #0 ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] -; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -227,7 +227,6 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s42, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; clobber s42 @@ -401,9 +400,8 @@ define void @realign_stack_no_fp_elim() #1 { ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} ; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}} @@ -441,11 +439,10 @@ define void 
@no_unused_non_csr_sgpr_for_fp() #1 { ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: ; implicit-def: $vgpr48 ; MUBUF-DAG: buffer_store_dword ; FLATSCR-DAG: scratch_store_dword @@ -453,7 +450,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; FLATSCR: s_add_i32 s32, s32, 12{{$}} ; GCN: ;;#ASMSTART -; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -488,7 +485,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-DAG: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill @@ -500,7 +497,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; FLATSCR-DAG: scratch_store_dword ; GCN: ;;#ASMSTART -; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 
0x40100 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 @@ -594,7 +591,7 @@ define void @callee_need_to_spill_fp_to_memory() #3 { ; VGPR. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: ; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; FLATSCR: s_mov_b32 s33, s2 +; FLATSCR: s_mov_b32 s33, s0 ; MUBUF: s_mov_b32 s33, s32 ; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] @@ -635,14 +632,14 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { ; Make sure that the FP save happens after restoring exec from the same ; register. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: -; FLATSCR: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; FLATSCR: s_mov_b32 s0, s33 ; FLATSCR: s_mov_b32 s33, s32 ; GCN-NOT: v_writelane_b32 v40, s33 -; FLATSCR: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; FLATSCR: s_mov_b64 exec, [[COPY_EXEC0]] -; FLATSCR: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; FLATSCR: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR: s_mov_b64 exec, s[2:3] +; FLATSCR: s_or_saveexec_b64 s[2:3], -1 ; GCN-NOT: v_readlane_b32 s33, v40 -; FLATSCR: s_mov_b32 s33, [[FP_SCRATCH_COPY]] +; FLATSCR: s_mov_b32 s33, s0 ; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_reg() #1 { call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", @@ -675,7 +672,7 @@ define void @callee_need_to_spill_fp_to_reg() #1 { ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; MUBUF: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] ; GCN-NOT: v_mov_b32_e32 v0, 0x100c -; MUBUF: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200 +; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200 ; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; FLATSCR: v_mov_b32_e32 v0, 0 ; 
FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 880a88ed9026d..449d57f09e68b 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -30,14 +30,7 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; ; GCN_DBG-LABEL: test_loop: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s10, -1 -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s2, s[0:1], 0x9 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s2, 0 ; GCN_DBG-NEXT: s_load_dword s1, s[0:1], 0xa @@ -46,20 +39,11 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_mov_b64 s[4:5], exec -; GCN_DBG-NEXT: s_mov_b64 exec, -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -81,9 +65,6 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; 
GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock ; GCN_DBG-NEXT: s_endpgm @@ -124,31 +105,16 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; ; GCN_DBG-LABEL: loop_const_true: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s10, -1 -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -170,9 +136,6 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, 
s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 ; GCN_DBG-NEXT: s_branch .LBB1_2 entry: @@ -207,31 +170,16 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw ; ; GCN_DBG-LABEL: loop_const_false: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s10, -1 -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -253,9 +201,6 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; 
GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 ; GCN_DBG-NEXT: s_branch .LBB2_2 entry: @@ -291,31 +236,16 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw ; ; GCN_DBG-LABEL: loop_const_undef: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s10, -1 -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -335,9 +265,6 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN_DBG-NEXT: s_branch .LBB3_2 entry: @@ -387,14 +314,7 @@ define 
amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; ; GCN_DBG-LABEL: loop_arg_0: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s10, -1 -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0 @@ -411,19 +331,11 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB4_2 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB4_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2 @@ -446,9 +358,6 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 ; GCN_DBG-NEXT: s_branch .LBB4_2 
entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index ee9da868068af..bf87cea9089d4 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -19,14 +19,14 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -84,14 +84,14 @@ bb.outer.end: ; preds = %bb.outer.then, %bb. 
; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -161,7 +161,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -170,9 +170,6 @@ bb.outer.end: ; preds = %bb.inner.then, %bb ; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; GCN-O0-NEXT: buffer_store_dword -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[THEN_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: s_branch [[TEMP_BB:.LBB[0-9_]+]] @@ -183,7 +180,7 @@ bb.outer.end: ; 
preds = %bb.inner.then, %bb ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_xor_b64 exec, exec, s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: store_dword @@ -263,9 +260,6 @@ bb.outer.end: ; preds = %bb, %bb.then, %b ; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; GCN-O0-NEXT: buffer_store_dword [[VGPR]] -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: s_branch [[INNER_IF_OUTER_ELSE:.LBB[0-9_]+]] @@ -276,14 +270,14 @@ bb.outer.end: ; preds = %bb, %bb.then, %b ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_xor_b64 exec, exec, s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: store_dword ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 
exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[FLOW1:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -293,7 +287,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -363,7 +357,7 @@ bb.outer.end: ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -425,81 +419,61 @@ bb.end: ; preds = %bb.then, %bb ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; GCN-O0-NEXT: buffer_store_dword [[VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN-O0: buffer_load_dword [[RESTORED_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: 
v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0: buffer_load_dword +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] ; GCN-O0-NEXT: s_mov_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; 
GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; GCN-O0-NEXT: buffer_store_dword [[RESTORED_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0: buffer_load_dword [[RESTORED_1_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_1_VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_1_VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]] ; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; GCN-O0-NEXT: buffer_store_dword [[RESTORED_1_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0-NEXT: s_and_b64 
s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[FLOW2:.LBB[0-9_]+]] ; GCN-O0: {{^}}[[FLOW2]]: -; GCN-O0: buffer_load_dword [[RESTORED_2_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_2_VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_2_VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]] ; GCN-O0: s_branch [[FLOW:.LBB[0-9_]+]] ; GCN-O0: {{^}}[[FLOW]]: -; GCN-O0: buffer_load_dword [[RESTORED_3_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_3_VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_3_VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; GCN-O0-NEXT: buffer_store_dword [[RESTORED_3_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[FLOW3:.LBB[0-9_]+]] ; GCN-O0: ; %bb.{{[0-9]+}}: -; GCN-O0: buffer_load_dword [[RESTORED_4_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_4_VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_4_VGPR]], s{{[0-9]+}}, 
[[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; GCN-O0-NEXT: buffer_store_dword [[RESTORED_4_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0: {{^}}[[FLOW3]]: ; GCN-O0-COUNT-4: buffer_load_dword -; GCN-O0: buffer_load_dword [[RESTORED_5_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]] ; GCN-O0: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-COUNT-2: s_mov_b64 -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, 
[[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] ; GCN-O0-COUNT-4: buffer_store_dword ; GCN-O0: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 71dade0f278dc..a2f83301f2a1b 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -10,7 +10,7 @@ ; GCN-LABEL: {{^}}divergent_if_endif: -; VGPR: workitem_private_segment_byte_size = 16{{$}} +; VGPR: workitem_private_segment_byte_size = 12{{$}} ; GCN: {{^}}; %bb.0: @@ -19,7 +19,7 @@ ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}} ; Spill saved exec ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec @@ -82,13 +82,13 @@ endif: } ; GCN-LABEL: {{^}}divergent_loop: -; VGPR: workitem_private_segment_byte_size = 20{{$}} +; VGPR: workitem_private_segment_byte_size = 16{{$}} ; GCN: {{^}}; %bb.0: ; GCN-DAG: s_mov_b32 m0, -1 ; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}} ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]] -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}} ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill @@ -166,7 +166,7 @@ end: ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte 
Folded Spill ; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0 -; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, [[ZERO]] +; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]] ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] @@ -187,7 +187,6 @@ end: ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow -; VGPR: buffer_load_dword ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 28e82208f53ad..b3251e835b073 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -33,7 +33,6 @@ define float @call_split_type_used_outside_block_v2f32() #0 { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v41, s16, 0 @@ -72,7 +71,6 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v41, s16, 0 @@ 
-111,7 +109,6 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v41, s16, 0 @@ -150,7 +147,6 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v41, s16, 0 diff --git a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir index 2a96f5eef8c9e..aed642d1f0670 100644 --- a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir @@ -19,11 +19,10 @@ body: | ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr42, 0, killed $vgpr0 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr43, 1, killed $vgpr0 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr46, 2, killed $vgpr0 - ; CHECK-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr47, 3, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr42, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr43, 1, $vgpr0 + ; CHECK-NEXT: $vgpr0 = 
V_WRITELANE_B32 $sgpr46, 2, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr47, 3, $vgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll index 060641476d2f8..0807a567a412e 100644 --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -19,26 +19,25 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: ; implicit-def: $vgpr41 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s40, 8 +; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: v_writelane_b32 v40, s42, 10 +; CHECK-NEXT: v_writelane_b32 v40, s43, 11 +; CHECK-NEXT: v_writelane_b32 v40, s44, 12 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v41, s30, 0 -; CHECK-NEXT: v_writelane_b32 v41, s31, 1 -; CHECK-NEXT: v_writelane_b32 v41, s34, 2 -; CHECK-NEXT: v_writelane_b32 v41, s35, 3 -; CHECK-NEXT: v_writelane_b32 v41, s36, 4 -; CHECK-NEXT: v_writelane_b32 v41, s37, 5 -; CHECK-NEXT: v_writelane_b32 v41, s38, 6 -; CHECK-NEXT: v_writelane_b32 v41, s39, 7 -; CHECK-NEXT: v_writelane_b32 v41, s40, 8 -; CHECK-NEXT: v_writelane_b32 v41, 
s41, 9 -; CHECK-NEXT: v_writelane_b32 v41, s42, 10 -; CHECK-NEXT: v_writelane_b32 v41, s43, 11 -; CHECK-NEXT: v_writelane_b32 v41, s44, 12 -; CHECK-NEXT: v_writelane_b32 v41, s45, 13 -; CHECK-NEXT: v_writelane_b32 v41, s46, 14 +; CHECK-NEXT: v_writelane_b32 v40, s45, 13 +; CHECK-NEXT: v_writelane_b32 v40, s46, 14 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef ; CHECK-NEXT: .Ltmp0: @@ -46,12 +45,12 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12 -; CHECK-NEXT: v_writelane_b32 v41, s47, 15 +; CHECK-NEXT: v_writelane_b32 v40, s47, 15 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_writelane_b32 v42, s16, 0 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v40, v31 +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v41, v31 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 @@ -69,33 +68,33 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v40 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0] ; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: flat_store_dword v[0:1], v2 -; CHECK-NEXT: v_readlane_b32 s47, v41, 15 -; CHECK-NEXT: v_readlane_b32 s46, v41, 14 -; CHECK-NEXT: v_readlane_b32 s45, v41, 13 -; 
CHECK-NEXT: v_readlane_b32 s44, v41, 12 -; CHECK-NEXT: v_readlane_b32 s43, v41, 11 -; CHECK-NEXT: v_readlane_b32 s42, v41, 10 -; CHECK-NEXT: v_readlane_b32 s41, v41, 9 -; CHECK-NEXT: v_readlane_b32 s40, v41, 8 -; CHECK-NEXT: v_readlane_b32 s39, v41, 7 -; CHECK-NEXT: v_readlane_b32 s38, v41, 6 -; CHECK-NEXT: v_readlane_b32 s37, v41, 5 -; CHECK-NEXT: v_readlane_b32 s36, v41, 4 -; CHECK-NEXT: v_readlane_b32 s35, v41, 3 -; CHECK-NEXT: v_readlane_b32 s34, v41, 2 -; CHECK-NEXT: v_readlane_b32 s31, v41, 1 -; CHECK-NEXT: v_readlane_b32 s30, v41, 0 +; CHECK-NEXT: v_readlane_b32 s47, v40, 15 +; CHECK-NEXT: v_readlane_b32 s46, v40, 14 +; CHECK-NEXT: v_readlane_b32 s45, v40, 13 +; CHECK-NEXT: v_readlane_b32 s44, v40, 12 +; CHECK-NEXT: v_readlane_b32 s43, v40, 11 +; CHECK-NEXT: v_readlane_b32 s42, v40, 10 +; CHECK-NEXT: v_readlane_b32 s41, v40, 9 +; CHECK-NEXT: v_readlane_b32 s40, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: v_readlane_b32 s4, v42, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll index dd8b39d11071d..97383490841e5 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -25,7 +25,6 @@ define void 
@test_stack_realign(<8 x i32> %val, i32 %idx) #0 { ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-NEXT: ; implicit-def: $vgpr42 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v42, s30, 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index 45d685d7664e1..f69602bdc39ba 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -117,15 +117,21 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s104, exec_lo +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, 0 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v72, s4 ; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v72, s2, 0 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, 4 +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v72, s3, 1 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v72, s4 ; 4-byte Folded Spill ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, 0 +; FLAT_SCR_OPT-NEXT: scratch_load_dword v72, off, s4 +; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) +; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 +; 
FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s104 ; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 ; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART @@ -222,14 +228,22 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload +; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, exec_lo +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v2, s3 +; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 4 +; FLAT_SCR_OPT-NEXT: scratch_load_dword v2, off, s3 ; 4-byte Folded Reload ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 ; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 +; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v2, 0 +; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v2, 1 +; FLAT_SCR_OPT-NEXT: scratch_load_dword v2, off, s3 +; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) +; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s2 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, 0 ; FLAT_SCR_OPT-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_OPT-NEXT: s_endpgm @@ -237,15 +251,21 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_ARCH-LABEL: test: ; FLAT_SCR_ARCH: ; %bb.0: ; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s104, exec_lo +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, 0 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v72, s4 ; FLAT_SCR_ARCH-NEXT: s_waitcnt 
lgkmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v72, s2, 0 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, 4 +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v72, s3, 1 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v72, s4 ; 4-byte Folded Spill ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, 0 +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v72, off, s4 +; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) +; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s104 ; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 ; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART @@ -342,14 +362,22 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, exec_lo +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v2, s3 +; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 4 +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v2, off, s3 ; 4-byte Folded Reload ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 ; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1 +; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v2, 0 +; FLAT_SCR_ARCH-NEXT: 
v_readlane_b32 s1, v2, 1 +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v2, off, s3 +; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) +; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s2 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, 0 ; FLAT_SCR_ARCH-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_ARCH-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir index f8c3629764ea7..aca1351de3ef9 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -12,13 +12,14 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo - ; CHECK: S_NOP 0, implicit-def $exec_lo + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_lo @@ -37,13 +38,14 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi - ; CHECK: S_NOP 0, implicit-def $exec_hi + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 
killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_hi @@ -62,16 +64,17 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec - ; CHECK: S_NOP 0, implicit-def $exec + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec @@ -93,12 +96,13 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo - ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead 
renamable $sgpr1, implicit-def $exec_lo - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo @@ -116,12 +120,13 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi - ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi @@ -139,15 +144,16 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: 
name: reload_sgpr_spill_into_copy_to_exec - ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir index 5732e43b3c423..e8688d8f55d08 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir @@ -13,13 +13,14 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0 - ; CHECK: S_NOP 0, implicit-def $m0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $m0 ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0 - ; CHECK-NEXT: 
renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec @@ -43,12 +44,13 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0 - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll index 52eab573ea44a..e7cab2606aa8a 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -16,7 +16,6 @@ define void @callee_with_stack_and_call() #0 { ; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s33 
offset:8 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] -; SPILL-TO-VGPR-NEXT: ; implicit-def: $vgpr40 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 ; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll index 1a4577ea2e1cf..7c9d01db9c2c0 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -8,13 +8,11 @@ define amdgpu_gfx void @gfx_func() { ; SDAG-LABEL: gfx_func: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s38, s33 +; SDAG-NEXT: s_mov_b32 s36, s33 ; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; SDAG-NEXT: s_mov_b64 exec, s[34:35] -; SDAG-NEXT: ; implicit-def: $vgpr40 -; SDAG-NEXT: s_addk_i32 s32, 0x400 ; SDAG-NEXT: v_writelane_b32 v40, s4, 0 ; SDAG-NEXT: v_writelane_b32 v40, s5, 1 ; SDAG-NEXT: v_writelane_b32 v40, s6, 2 @@ -35,6 +33,7 @@ define amdgpu_gfx void @gfx_func() { ; SDAG-NEXT: v_writelane_b32 v40, s21, 17 ; SDAG-NEXT: v_writelane_b32 v40, s22, 18 ; SDAG-NEXT: v_writelane_b32 v40, s23, 19 +; SDAG-NEXT: s_addk_i32 s32, 0x400 ; SDAG-NEXT: v_writelane_b32 v40, s24, 20 ; SDAG-NEXT: v_writelane_b32 v40, s25, 21 ; SDAG-NEXT: s_getpc_b64 s[34:35] @@ -82,20 +81,18 @@ define amdgpu_gfx void @gfx_func() { ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] ; SDAG-NEXT: s_addk_i32 s32, 0xfc00 -; SDAG-NEXT: s_mov_b32 s33, s38 +; SDAG-NEXT: s_mov_b32 s33, s36 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: gfx_func: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s38, s33 +; GISEL-NEXT: s_mov_b32 s36, s33 ; GISEL-NEXT: s_mov_b32 s33, 
s32 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[34:35] -; GISEL-NEXT: ; implicit-def: $vgpr40 -; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s4, 0 ; GISEL-NEXT: v_writelane_b32 v40, s5, 1 ; GISEL-NEXT: v_writelane_b32 v40, s6, 2 @@ -116,6 +113,7 @@ define amdgpu_gfx void @gfx_func() { ; GISEL-NEXT: v_writelane_b32 v40, s21, 17 ; GISEL-NEXT: v_writelane_b32 v40, s22, 18 ; GISEL-NEXT: v_writelane_b32 v40, s23, 19 +; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s24, 20 ; GISEL-NEXT: v_writelane_b32 v40, s25, 21 ; GISEL-NEXT: s_getpc_b64 s[34:35] @@ -163,7 +161,7 @@ define amdgpu_gfx void @gfx_func() { ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s38 +; GISEL-NEXT: s_mov_b32 s33, s36 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call void @extern_c_func() diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index f0324ae9ff583..8da0dc3c0e0ee 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -103,7 +103,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -137,16 +136,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -173,18 +171,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -209,16 +205,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; 
implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -249,7 +244,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -257,7 +251,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -286,15 +279,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: 
s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -325,15 +316,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -363,15 +352,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: 
v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -405,7 +392,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -413,7 +399,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -442,15 +427,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -481,15 +464,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; 
implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -519,15 +500,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -559,7 +538,6 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b @@ -592,9 +570,8 @@ define amdgpu_gfx void 
@test_call_external_void_func_i8_imm(i32) #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -627,9 +604,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -662,9 +638,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -701,7 +676,6 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -735,9 +709,8 @@ define amdgpu_gfx void 
@test_call_external_void_func_i8_signext(i32) #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 @@ -771,9 +744,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 @@ -807,9 +779,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 @@ -846,7 +817,6 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -880,9 +850,8 @@ 
define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 @@ -916,9 +885,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 @@ -952,9 +920,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 @@ -989,7 +956,6 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; 
GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b @@ -1022,9 +988,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -1057,9 +1022,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -1092,9 +1056,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -1131,7 +1094,6 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: 
v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1165,9 +1127,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 @@ -1201,9 +1162,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 @@ -1237,9 +1197,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 @@ -1276,7 +1235,6 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; implicit-def: $vgpr40 ; 
GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1310,9 +1268,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 @@ -1346,9 +1303,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 @@ -1382,9 +1338,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 @@ -1419,7 +1374,6 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], 
s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 @@ -1452,9 +1406,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -1487,9 +1440,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -1522,9 +1474,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -1559,7 +1510,6 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; 
GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b @@ -1593,16 +1543,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -1629,17 +1578,16 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -1664,16 +1612,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -1705,7 +1652,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1739,9 +1685,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 
; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 @@ -1776,9 +1721,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 @@ -1813,9 +1757,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 @@ -1851,7 +1794,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -1887,9 +1829,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: 
v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -1925,9 +1866,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 @@ -1961,9 +1901,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -2004,7 +1943,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 @@ -2040,9 +1978,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 
; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -2079,9 +2016,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -2116,9 +2052,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -2161,7 +2096,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 @@ -2199,9 +2133,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -2240,18 
+2173,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -2278,9 +2210,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -2321,7 +2252,6 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; 
GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 @@ -2354,9 +2284,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -2389,9 +2318,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -2424,9 +2352,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -2461,7 +2388,6 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: 
; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 @@ -2494,9 +2420,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -2529,9 +2454,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -2564,9 +2488,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -2601,7 +2524,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; 
GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2635,16 +2557,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -2671,17 +2592,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -2706,16 +2626,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -2744,7 +2663,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2779,9 +2697,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -2816,18 +2733,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -2852,9 +2768,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -2891,7 +2806,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2928,9 +2842,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 @@ -2967,9 +2880,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -3004,9 +2916,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 @@ -3045,7 +2956,6 @@ define amdgpu_gfx void 
@test_call_external_void_func_f64_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3079,16 +2989,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -3115,17 +3024,16 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, 
s1, external_void_func_f64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -3150,16 +3058,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -3188,7 +3095,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3224,9 +3130,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill 
; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -3262,9 +3167,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 @@ -3298,9 +3202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -3338,7 +3241,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3376,9 +3278,8 @@ define amdgpu_gfx void 
@test_call_external_void_func_v3f64_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -3416,9 +3317,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -3453,9 +3353,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -3496,7 +3395,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 
; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -3529,9 +3427,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 @@ -3564,9 +3461,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 @@ -3599,9 +3495,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 @@ -3637,7 +3532,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: ; 
implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -3670,9 +3564,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 @@ -3705,9 +3598,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 @@ -3740,9 +3632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 @@ -3778,7 +3669,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; 
GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -3811,9 +3701,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 @@ -3846,9 +3735,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 @@ -3881,9 +3769,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 @@ -3918,7 +3805,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 
4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 @@ -3952,16 +3838,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -3988,17 +3873,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -4023,16 +3907,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -4061,7 +3944,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 @@ -4095,16 +3977,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 
exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -4131,18 +4012,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -4167,16 +4047,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -4206,7 +4085,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -4239,9 +4117,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 @@ -4274,9 +4151,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: ; implicit-def: $vgpr40 -; 
GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 @@ -4309,9 +4185,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 @@ -4346,7 +4221,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 @@ -4380,16 +4254,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: 
s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -4416,18 +4289,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -4452,16 +4324,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, 
external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -4491,7 +4362,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -4524,9 +4394,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 @@ -4559,9 +4428,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 @@ -4594,9 +4462,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off 
-; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 @@ -4632,7 +4499,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -4665,9 +4531,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 @@ -4700,9 +4565,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 @@ -4735,9 +4599,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 @@ -4772,7 +4635,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -4806,16 +4668,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -4842,17 +4703,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -4877,16 +4737,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -4915,7 +4774,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded 
Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 @@ -4950,9 +4808,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -4987,18 +4844,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_mov_b32_e32 v2, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -5023,9 +4879,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, 
s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -5062,7 +4917,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 @@ -5098,9 +4952,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 @@ -5136,9 +4989,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 @@ -5172,9 +5024,8 @@ define amdgpu_gfx 
void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 @@ -5213,7 +5064,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -5246,9 +5096,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 @@ -5281,9 +5130,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_v4i32@rel32@lo+4 @@ -5316,9 +5164,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 @@ -5353,7 +5200,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5389,9 +5235,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -5427,9 +5272,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 
v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 @@ -5463,9 +5307,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -5503,7 +5346,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5540,9 +5382,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -5579,9 +5420,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; 
GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_mov_b32_e32 v4, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -5616,9 +5456,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -5660,7 +5499,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5697,9 +5535,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35] @@ -5737,9 +5574,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] @@ -5777,9 +5613,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] @@ -5818,7 +5653,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5858,9 +5692,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -5900,9 +5733,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 ; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 @@ -5938,9 +5770,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -5985,7 +5816,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6024,9 +5854,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[34:35] @@ -6066,9 +5895,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[0:1] @@ -6108,9 +5936,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] @@ -6154,7 +5981,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6198,9 +6024,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[34:35] @@ -6244,9 +6069,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: s_clause 0x7 ; GFX11-NEXT: global_load_b128 v[0:3], v28, s[0:1] @@ -6290,9 +6114,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] @@ -6341,7 +6164,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: global_load_dword v32, v[0:1], off -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35] @@ -6387,9 +6209,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -6436,9 +6257,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: global_load_b32 v32, v[0:1], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: s_clause 0x7 @@ -6484,9 +6304,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 @@ -6532,32 +6351,31 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v42, s30, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_writelane_b32 v43, s34, 0 -; GFX9-NEXT: v_writelane_b32 v42, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v42, v1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; 
GFX9-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: global_store_dword v[40:41], v0, off +; GFX9-NEXT: global_store_dword v[41:42], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 -; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s34, v43, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 @@ -6572,35 +6390,34 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr42 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v42, s30, 0 -; GFX10-NEXT: v_mov_b32_e32 v40, v0 +; GFX10-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v43, s34, 0 -; GFX10-NEXT: v_writelane_b32 v42, s31, 1 -; GFX10-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: global_store_dword v[40:41], v0, off +; GFX10-NEXT: global_store_dword v[41:42], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s31, v42, 1 -; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: v_readlane_b32 s34, v43, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 @@ -6617,35 +6434,34 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 
GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr42 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 -; GFX11-NEXT: v_writelane_b32 v42, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 v43, s0, 0 -; GFX11-NEXT: v_writelane_b32 v42, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: global_store_b32 v[40:41], v0, off dlc +; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-NEXT: v_readlane_b32 s31, v42, 1 -; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v43, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 @@ -6660,35 +6476,34 @@ define amdgpu_gfx void 
@test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr42 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, v0 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off +; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 
offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v43, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 @@ -6714,7 +6529,6 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6751,9 +6565,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35] @@ -6791,9 +6604,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v0, v1, s[0:1] @@ -6831,9 +6643,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1] @@ -6873,7 +6684,6 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: s_addk_i32 s32, 0x800 @@ -6911,17 +6721,16 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -6949,20 +6758,19 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, s33 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, s33 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -6989,17 +6797,16 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, 
external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -7036,7 +6843,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: s_addk_i32 s32, 0x800 @@ -7081,9 +6887,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 @@ -7135,11 +6940,10 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX11-NEXT: s_add_i32 vcc_lo, s33, 8 -; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, s33 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, vcc_lo :: v_dual_mov_b32 v1, s33 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 
s[30:31], s[0:1] @@ -7183,10 +6987,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_add_i32 vcc_lo, s33, 8 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, vcc_lo ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s33 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 @@ -7242,7 +7045,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 @@ -7297,9 +7099,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35] @@ -7355,9 +7156,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] @@ -7409,9 +7209,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] @@ -7462,7 +7261,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX9-LABEL: tail_call_byval_align16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -7470,8 +7269,6 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 @@ -7502,6 +7299,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s59, 27 ; GFX9-NEXT: v_writelane_b32 v40, s60, 28 ; GFX9-NEXT: v_writelane_b32 v40, s61, 29 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s62, 30 ; GFX9-NEXT: v_writelane_b32 v40, s63, 31 ; GFX9-NEXT: 
s_getpc_b64 s[4:5] @@ -7548,7 +7346,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7556,7 +7354,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -7566,9 +7364,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 @@ -7645,7 +7442,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7653,7 +7450,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: s_mov_b32 s5, s33 +; GFX11-NEXT: s_mov_b32 s4, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -7661,9 +7458,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16 ; GFX11-NEXT: scratch_load_b32 v31, off, s33 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 @@ -7737,7 +7533,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: s_mov_b32 s33, s5 +; GFX11-NEXT: s_mov_b32 s33, s4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7745,7 +7541,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -7754,9 +7550,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16 ; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 @@ -7831,7 +7626,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: @@ -7851,7 +7646,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -7885,16 +7679,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; 
GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -7921,18 +7714,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -7957,16 +7748,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: 
s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -7995,9 +7785,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -8030,10 +7819,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 @@ -8067,10 +7855,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 @@ -8104,10 +7891,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword 
off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 @@ -8143,9 +7929,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -8178,10 +7963,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 @@ -8215,10 +7999,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 
16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 @@ -8252,10 +8035,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 @@ -8291,9 +8073,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 42 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -8326,10 +8107,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 42 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: 
s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 @@ -8363,10 +8143,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 42 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 @@ -8400,10 +8179,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 @@ -8439,10 +8217,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_mov_b32 s5, 0 @@ -8477,10 +8254,9 @@ define amdgpu_gfx void 
@test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 @@ -8517,10 +8293,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 @@ -8557,10 +8332,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 @@ -8599,10 +8373,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword 
v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -8641,9 +8414,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -8685,9 +8457,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -8729,9 +8500,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 
16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -8776,12 +8546,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -8820,10 +8589,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 @@ -8866,10 +8634,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 @@ -8912,10 +8679,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 @@ -8960,10 +8726,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -9008,9 +8773,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -9058,9 +8822,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; 
GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -9108,9 +8871,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -9163,11 +8925,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 @@ -9217,9 +8978,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: 
v_writelane_b32 v40, s5, 1 @@ -9273,9 +9033,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -9329,9 +9088,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -9389,9 +9147,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x4400 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -9424,10 +9181,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; 
GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 @@ -9461,10 +9217,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x4400 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 @@ -9498,10 +9253,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 @@ -9537,9 +9291,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; 
GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 4.0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -9572,10 +9325,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 4.0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 @@ -9609,10 +9361,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 4.0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 @@ -9646,10 +9397,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 @@ -9685,10 +9435,9 @@ define amdgpu_gfx void 
@test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -9723,10 +9472,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 @@ -9763,10 +9511,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 @@ -9803,10 +9550,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 @@ -9845,11 +9591,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -9886,10 +9631,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 @@ -9929,10 +9673,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: 
s_mov_b32 s4, 1.0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 @@ -9972,10 +9715,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 @@ -10017,13 +9759,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -10064,10 +9805,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 
1.0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 @@ -10113,10 +9853,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 @@ -10162,10 +9901,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 @@ -10213,10 +9951,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 
0 ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 @@ -10251,10 +9988,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 @@ -10291,10 +10027,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 @@ -10331,10 +10066,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 @@ -10373,12 +10107,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -10417,10 +10150,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 @@ -10463,10 +10195,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 @@ -10509,10 +10240,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 
exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 @@ -10557,14 +10287,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -10607,10 +10336,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 @@ -10659,10 +10387,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 
-; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 @@ -10711,10 +10438,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 @@ -10765,10 +10491,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -10800,10 +10525,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; 
GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 @@ -10837,10 +10561,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 @@ -10874,10 +10597,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 @@ -10914,11 +10636,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 
+; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -10951,9 +10672,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 @@ -10990,9 +10710,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -11029,9 +10748,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -11071,11 +10789,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte 
Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -11108,9 +10825,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 @@ -11147,9 +10863,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -11186,9 +10901,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: 
s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -11228,10 +10942,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 3 @@ -11266,10 +10979,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 @@ -11306,10 +11018,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 @@ -11346,10 +11057,9 @@ define amdgpu_gfx 
void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 @@ -11388,10 +11098,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX9-NEXT: s_movk_i32 s5, 0x4400 @@ -11426,10 +11135,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 @@ -11466,10 +11174,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 
GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 @@ -11506,10 +11213,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 @@ -11548,11 +11254,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -11585,9 +11290,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 @@ -11624,9 +11328,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -11663,9 +11366,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -11705,10 +11407,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: 
s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 0x40003 @@ -11743,10 +11444,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 @@ -11783,10 +11483,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 @@ -11823,10 +11522,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 @@ -11865,10 +11563,9 @@ define amdgpu_gfx void 
@test_call_external_void_func_v2f16_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -11900,10 +11597,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 @@ -11937,10 +11633,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 @@ -11974,10 +11669,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 @@ -12014,11 +11708,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -12051,9 +11744,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 @@ -12090,9 +11782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 
v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -12129,9 +11820,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -12171,10 +11861,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -12209,10 +11898,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 @@ -12249,10 
+11937,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 @@ -12289,10 +11976,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 @@ -12331,11 +12017,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 @@ -12372,10 +12057,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 @@ -12415,10 +12099,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 @@ -12458,10 +12141,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 @@ -12503,12 +12185,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded 
Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 @@ -12547,10 +12228,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -12593,10 +12273,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -12639,10 +12318,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -12687,13 +12365,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 @@ -12728,9 +12405,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -12771,9 +12447,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; 
GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -12814,9 +12489,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -12860,12 +12534,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -12904,10 +12577,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 @@ -12950,10 +12622,9 @@ define amdgpu_gfx 
void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 @@ -12996,10 +12667,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 @@ -13044,13 +12714,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -13091,10 +12760,9 @@ define amdgpu_gfx void 
@test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 @@ -13140,10 +12808,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 @@ -13189,10 +12856,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 @@ -13240,10 +12906,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -13291,9 +12956,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -13344,9 +13008,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -13397,9 +13060,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; 
GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -13454,8 +13116,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13464,6 +13124,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -13510,10 +13171,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 @@ -13568,10 +13228,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: 
s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 @@ -13626,10 +13285,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 @@ -13686,8 +13344,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13698,6 +13354,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 @@ -13753,9 +13410,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 
-; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -13822,9 +13478,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -13891,9 +13546,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -13964,8 +13618,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13981,6 +13633,7 @@ define amdgpu_gfx void 
@test_call_external_void_func_v32i32_inreg() #0 { ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 @@ -14075,9 +13728,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -14189,9 +13841,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -14297,9 +13948,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 
16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -14411,8 +14061,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -14427,6 +14075,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 @@ -14527,9 +14176,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -14646,9 +14294,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 
s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -14757,9 +14404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -14879,7 +14525,6 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -14918,9 +14563,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 @@ -14957,9 +14601,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 
v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 @@ -14993,9 +14636,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 @@ -15038,7 +14680,6 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -15109,12 +14750,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX10-NEXT: v_mov_b32_e32 v2, 14 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -15181,9 +14821,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 ; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: 
v_dual_mov_b32 v3, 15 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1 @@ -15236,9 +14875,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 @@ -15331,7 +14969,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -15408,7 +15045,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX10-NEXT: v_mov_b32_e32 v0, 11 ; GFX10-NEXT: v_mov_b32_e32 v1, 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 13 -; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -15484,11 +15120,10 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX11-NEXT: v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9 ; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; 
GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1 @@ -15545,10 +15180,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -15637,7 +15271,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -15714,7 +15347,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 -; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -15794,12 +15426,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX11-NEXT: v_mov_b32_e32 v6, 0x41200000 ; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: 
v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 ; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1.0 @@ -15857,10 +15488,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll index 4eaca1701ea90..3d95bd3b6c0c1 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -15,10 +15,9 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -54,9 +53,8 @@ define 
amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] @@ -95,9 +93,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_getpc_b64 s[4:5] @@ -137,7 +134,6 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: v_writelane_b32 v0, s28, 0 ; GFX9-NEXT: v_writelane_b32 v0, s29, 1 ; GFX9-NEXT: v_writelane_b32 v0, s30, 2 @@ -166,7 +162,6 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: v_writelane_b32 v0, s28, 0 ; GFX10-NEXT: v_writelane_b32 v0, s29, 1 ; GFX10-NEXT: v_writelane_b32 v0, s30, 2 @@ -193,10 +188,9 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
-; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v0, s28, 0 ; GFX11-NEXT: v_writelane_b32 v0, s29, 1 ; GFX11-NEXT: v_writelane_b32 v0, s30, 2 @@ -212,9 +206,9 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { ; GFX11-NEXT: v_readlane_b32 s30, v0, 2 ; GFX11-NEXT: v_readlane_b32 s29, v0, 1 ; GFX11-NEXT: v_readlane_b32 s28, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -238,9 +232,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -280,9 +273,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: 
s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 @@ -324,9 +316,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 @@ -368,33 +359,32 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v42, s34, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v41, s31, 1 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-NEXT: v_mov_b32_e32 v41, v31 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: v_mov_b32_e32 v31, v41 ; GFX9-NEXT: ;;#ASMSTART ; 
GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v41, 1 -; GFX9-NEXT: v_readlane_b32 s30, v41, 0 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s34, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 @@ -409,35 +399,34 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr41 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: v_writelane_b32 v42, s34, 0 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v40, v31 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v31 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, 
external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_mov_b32_e32 v31, v40 +; GFX10-NEXT: v_mov_b32_e32 v31, v41 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v41, 1 -; GFX10-NEXT: v_readlane_b32 s30, v41, 0 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: v_readlane_b32 s34, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 @@ -454,35 +443,34 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s30, 0 ; GFX11-NEXT: v_writelane_b32 v42, s0, 0 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_mov_b32_e32 v40, v31 -; GFX11-NEXT: v_writelane_b32 v41, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_mov_b32_e32 v41, v31 ; GFX11-NEXT: 
s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_mov_b32_e32 v31, v40 +; GFX11-NEXT: v_mov_b32_e32 v31, v41 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v41, 1 -; GFX11-NEXT: v_readlane_b32 s30, v41, 0 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 @@ -506,9 +494,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -548,9 +535,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; 
GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 @@ -558,8 +544,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_mov_b32 s33, s4 @@ -592,9 +578,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 @@ -602,8 +587,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s33 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_mov_b32 s4, s33 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: s_mov_b32 s4, s33 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s33, s4 @@ -640,9 +625,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: 
s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -682,9 +666,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[36:37] ; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4 @@ -692,8 +675,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_mov_b32 s4, s34 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: s_mov_b32 s4, s34 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_mov_b32 s34, s4 @@ -726,9 +709,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 @@ -736,8 +718,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s34 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_mov_b32 s4, s34 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: s_mov_b32 s4, s34 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s34, s4 @@ -774,7 
+756,6 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: v_writelane_b32 v42, s34, 0 @@ -814,18 +795,17 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr41 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v40 @@ -856,18 +836,18 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr41 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v40 ; GFX11-NEXT: 
;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v41, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v40 @@ -895,18 +875,17 @@ define hidden void @void_func_void_clobber_s33() #1 { ; GFX9-LABEL: void_func_void_clobber_s33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v0, s33, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s33, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -914,20 +893,19 @@ define hidden void @void_func_void_clobber_s33() #1 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: ; implicit-def: $vgpr0 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v0, s33, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s33, v0, 0 -; GFX10-NEXT: 
s_xor_saveexec_b32 s5, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -936,19 +914,18 @@ define hidden void @void_func_void_clobber_s33() #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v0, s33, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s33, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -960,18 +937,17 @@ define hidden void @void_func_void_clobber_s34() #1 { ; GFX9-LABEL: void_func_void_clobber_s34: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v0, s34, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s34, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -979,20 +955,19 @@ define hidden void @void_func_void_clobber_s34() #1 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: ; implicit-def: $vgpr0 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v0, s34, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s34, v0, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1001,19 +976,18 @@ define hidden void @void_func_void_clobber_s34() #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v0, s34, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s34, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: 
s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1031,7 +1005,6 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1063,9 +1036,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 @@ -1097,9 +1069,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s33@rel32@lo+4 @@ -1133,7 +1104,6 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword 
v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1165,9 +1135,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 @@ -1199,9 +1168,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s34@rel32@lo+4 @@ -1235,9 +1203,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -1276,9 +1243,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX10-NEXT: buffer_store_dword v41, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr40 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 @@ -1286,8 +1252,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART @@ -1319,9 +1285,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 @@ -1329,8 +1294,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART @@ -1363,16 +1328,15 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v41, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v42, s34, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v41, s31, 2 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND @@ -1380,7 +1344,7 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v32 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v40, v32 +; GFX9-NEXT: v_mov_b32_e32 v41, v32 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 @@ -1389,15 +1353,15 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: ; use s4 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v40 +; GFX9-NEXT: ; use v41 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v41, 2 -; GFX9-NEXT: v_readlane_b32 s30, v41, 1 -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s34, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], 
s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 @@ -1412,43 +1376,42 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: ; implicit-def: $vgpr41 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s4, 0 ; GFX10-NEXT: v_writelane_b32 v42, s34, 0 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: v_writelane_b32 v41, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v40, v32 +; GFX10-NEXT: v_mov_b32_e32 v41, v32 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v41, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s4 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v40 +; GFX10-NEXT: ; use v41 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: 
v_readlane_b32 s31, v41, 2 -; GFX10-NEXT: v_readlane_b32 s30, v41, 1 -; GFX10-NEXT: v_readlane_b32 s4, v41, 0 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s34, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 @@ -1465,42 +1428,41 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s4, 0 ; GFX11-NEXT: v_writelane_b32 v42, s0, 0 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_mov_b32 s4, s40 -; GFX11-NEXT: v_writelane_b32 v41, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v32 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_mov_b32_e32 v40, v32 +; GFX11-NEXT: v_mov_b32_e32 v41, v32 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v41, s31, 2 +; GFX11-NEXT: v_writelane_b32 
v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s4 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v40 +; GFX11-NEXT: ; use v41 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v41, 2 -; GFX11-NEXT: v_readlane_b32 s30, v41, 1 -; GFX11-NEXT: v_readlane_b32 s4, v41, 0 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index b55d65b0a7a39..a67a44971b647 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -34,7 +34,6 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX9-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -63,7 +62,6 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: 
v_writelane_b32 v1, s31, 1 @@ -84,7 +82,7 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -93,7 +91,6 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_i1@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_i1@gotpcrel32@hi+12 -; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 @@ -106,7 +103,7 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -145,7 +142,6 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX9-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -174,7 +170,6 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 @@ -195,7 +190,7 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX11: ; %bb.0: ; %entry ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -204,7 +199,6 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_i16@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_i16@gotpcrel32@hi+12 -; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 @@ -217,7 +211,7 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -256,7 +250,6 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX9-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -285,7 +278,6 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 @@ -306,7 +298,7 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -315,7 +307,6 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_2xi16@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_2xi16@gotpcrel32@hi+12 -; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 @@ -328,7 +319,7 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -376,7 +367,6 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX9-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -405,7 +395,6 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 @@ -426,7 +415,7 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, 
s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill @@ -435,7 +424,6 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_3xi16@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_3xi16@gotpcrel32@hi+12 -; GFX11-NEXT: ; implicit-def: $vgpr2 ; GFX11-NEXT: v_writelane_b32 v2, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v2, s31, 1 @@ -448,7 +436,7 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -1661,9 +1649,8 @@ define amdgpu_gfx void @call_512xi32() #0 { ; GFX9-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr2 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -1692,10 +1679,9 @@ define amdgpu_gfx void @call_512xi32() #0 { ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] 
@@ -1725,10 +1711,9 @@ define amdgpu_gfx void @call_512xi32() #0 { ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_512xi32@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_512xi32@gotpcrel32@hi+12 -; GFX11-NEXT: ; implicit-def: $vgpr5 -; GFX11-NEXT: v_mov_b32_e32 v0, s33 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v5, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, s33 ; GFX11-NEXT: v_writelane_b32 v5, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index cc8d85c85b0b6..5040e5348aa14 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -401,7 +401,6 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -488,7 +487,6 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -579,7 +577,6 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -669,7 +666,6 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; 
GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -761,7 +757,6 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -850,7 +845,6 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -944,7 +938,6 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1042,7 +1035,6 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1149,7 +1141,6 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; 
GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1244,7 +1235,6 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1337,198 +1327,196 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: v_writelane_b32 v41, s30, 0 -; GCN-NEXT: v_writelane_b32 v41, s31, 1 -; GCN-NEXT: v_writelane_b32 v41, s34, 2 -; GCN-NEXT: v_writelane_b32 v41, s35, 3 -; GCN-NEXT: v_writelane_b32 v41, s36, 4 -; GCN-NEXT: v_writelane_b32 v41, s37, 5 -; GCN-NEXT: v_writelane_b32 v41, s38, 6 -; GCN-NEXT: v_writelane_b32 v41, s39, 7 -; GCN-NEXT: v_writelane_b32 v41, s40, 8 -; GCN-NEXT: v_writelane_b32 v41, s41, 9 -; GCN-NEXT: v_writelane_b32 v41, s42, 10 -; GCN-NEXT: v_writelane_b32 v41, s43, 11 -; GCN-NEXT: v_writelane_b32 v41, s44, 12 -; GCN-NEXT: v_writelane_b32 v41, s45, 13 -; GCN-NEXT: v_writelane_b32 v41, s46, 14 -; GCN-NEXT: v_writelane_b32 v41, s47, 15 -; GCN-NEXT: v_writelane_b32 v41, s48, 16 -; GCN-NEXT: v_writelane_b32 v41, s49, 17 -; GCN-NEXT: v_writelane_b32 v41, s50, 18 -; GCN-NEXT: 
v_writelane_b32 v41, s51, 19 -; GCN-NEXT: v_writelane_b32 v41, s52, 20 -; GCN-NEXT: v_writelane_b32 v41, s53, 21 -; GCN-NEXT: v_writelane_b32 v41, s54, 22 -; GCN-NEXT: v_writelane_b32 v41, s55, 23 -; GCN-NEXT: v_writelane_b32 v41, s56, 24 -; GCN-NEXT: v_writelane_b32 v41, s57, 25 -; GCN-NEXT: v_writelane_b32 v41, s58, 26 -; GCN-NEXT: v_writelane_b32 v41, s59, 27 -; GCN-NEXT: v_writelane_b32 v41, s60, 28 -; GCN-NEXT: v_writelane_b32 v41, s61, 29 -; GCN-NEXT: v_writelane_b32 v41, s62, 30 -; GCN-NEXT: v_writelane_b32 v41, s63, 31 -; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s35, 3 +; GCN-NEXT: v_writelane_b32 v40, s36, 4 +; GCN-NEXT: v_writelane_b32 v40, s37, 5 +; GCN-NEXT: v_writelane_b32 v40, s38, 6 +; GCN-NEXT: v_writelane_b32 v40, s39, 7 +; GCN-NEXT: v_writelane_b32 v40, s40, 8 +; GCN-NEXT: v_writelane_b32 v40, s41, 9 +; GCN-NEXT: v_writelane_b32 v40, s42, 10 +; GCN-NEXT: v_writelane_b32 v40, s43, 11 +; GCN-NEXT: v_writelane_b32 v40, s44, 12 +; GCN-NEXT: v_writelane_b32 v40, s45, 13 +; GCN-NEXT: v_writelane_b32 v40, s46, 14 +; GCN-NEXT: v_writelane_b32 v40, s47, 15 +; GCN-NEXT: v_writelane_b32 v40, s48, 16 +; GCN-NEXT: v_writelane_b32 v40, s49, 17 +; GCN-NEXT: v_writelane_b32 v40, s50, 18 +; GCN-NEXT: v_writelane_b32 v40, s51, 19 +; GCN-NEXT: v_writelane_b32 v40, s52, 20 +; GCN-NEXT: v_writelane_b32 v40, s53, 21 +; GCN-NEXT: v_writelane_b32 v40, s54, 22 +; GCN-NEXT: v_writelane_b32 v40, s55, 23 +; GCN-NEXT: v_writelane_b32 v40, s56, 24 +; GCN-NEXT: v_writelane_b32 v40, s57, 25 +; GCN-NEXT: v_writelane_b32 v40, s58, 26 +; GCN-NEXT: v_writelane_b32 v40, s59, 27 +; GCN-NEXT: v_writelane_b32 v40, s60, 28 +; GCN-NEXT: v_writelane_b32 v40, s61, 29 +; GCN-NEXT: v_writelane_b32 v40, s62, 30 +; GCN-NEXT: v_writelane_b32 v40, s63, 31 +; GCN-NEXT: 
v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s6, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v2 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: v_mov_b32_e32 v0, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execnz .LBB7_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v40 -; GCN-NEXT: v_readlane_b32 s63, v41, 31 -; GCN-NEXT: v_readlane_b32 s62, v41, 30 -; GCN-NEXT: v_readlane_b32 s61, v41, 29 -; GCN-NEXT: v_readlane_b32 s60, v41, 28 -; GCN-NEXT: v_readlane_b32 s59, v41, 27 -; GCN-NEXT: v_readlane_b32 s58, v41, 26 -; GCN-NEXT: v_readlane_b32 s57, v41, 25 -; GCN-NEXT: v_readlane_b32 s56, v41, 24 -; GCN-NEXT: v_readlane_b32 s55, v41, 23 -; GCN-NEXT: v_readlane_b32 s54, v41, 22 -; GCN-NEXT: v_readlane_b32 s53, v41, 21 -; GCN-NEXT: v_readlane_b32 s52, v41, 20 -; GCN-NEXT: v_readlane_b32 s51, v41, 19 -; GCN-NEXT: v_readlane_b32 s50, v41, 18 -; GCN-NEXT: v_readlane_b32 s49, v41, 17 -; GCN-NEXT: v_readlane_b32 s48, v41, 16 -; GCN-NEXT: v_readlane_b32 s47, v41, 15 -; GCN-NEXT: v_readlane_b32 s46, v41, 14 -; GCN-NEXT: v_readlane_b32 s45, v41, 13 -; GCN-NEXT: v_readlane_b32 s44, v41, 12 -; GCN-NEXT: v_readlane_b32 s43, v41, 11 -; GCN-NEXT: v_readlane_b32 s42, v41, 10 -; GCN-NEXT: v_readlane_b32 s41, v41, 9 -; GCN-NEXT: v_readlane_b32 s40, v41, 8 -; GCN-NEXT: v_readlane_b32 s39, v41, 7 -; GCN-NEXT: v_readlane_b32 s38, v41, 6 -; GCN-NEXT: v_readlane_b32 s37, v41, 5 -; GCN-NEXT: v_readlane_b32 s36, v41, 4 -; GCN-NEXT: v_readlane_b32 s35, v41, 3 -; GCN-NEXT: v_readlane_b32 s34, v41, 2 -; GCN-NEXT: v_readlane_b32 s31, v41, 1 -; GCN-NEXT: v_readlane_b32 s30, v41, 0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: 
v_mov_b32_e32 v0, v41 +; GCN-NEXT: v_readlane_b32 s63, v40, 31 +; GCN-NEXT: v_readlane_b32 s62, v40, 30 +; GCN-NEXT: v_readlane_b32 s61, v40, 29 +; GCN-NEXT: v_readlane_b32 s60, v40, 28 +; GCN-NEXT: v_readlane_b32 s59, v40, 27 +; GCN-NEXT: v_readlane_b32 s58, v40, 26 +; GCN-NEXT: v_readlane_b32 s57, v40, 25 +; GCN-NEXT: v_readlane_b32 s56, v40, 24 +; GCN-NEXT: v_readlane_b32 s55, v40, 23 +; GCN-NEXT: v_readlane_b32 s54, v40, 22 +; GCN-NEXT: v_readlane_b32 s53, v40, 21 +; GCN-NEXT: v_readlane_b32 s52, v40, 20 +; GCN-NEXT: v_readlane_b32 s51, v40, 19 +; GCN-NEXT: v_readlane_b32 s50, v40, 18 +; GCN-NEXT: v_readlane_b32 s49, v40, 17 +; GCN-NEXT: v_readlane_b32 s48, v40, 16 +; GCN-NEXT: v_readlane_b32 s47, v40, 15 +; GCN-NEXT: v_readlane_b32 s46, v40, 14 +; GCN-NEXT: v_readlane_b32 s45, v40, 13 +; GCN-NEXT: v_readlane_b32 s44, v40, 12 +; GCN-NEXT: v_readlane_b32 s43, v40, 11 +; GCN-NEXT: v_readlane_b32 s42, v40, 10 +; GCN-NEXT: v_readlane_b32 s41, v40, 9 +; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: v_readlane_b32 s39, v40, 7 +; GCN-NEXT: v_readlane_b32 s38, v40, 6 +; GCN-NEXT: v_readlane_b32 s37, v40, 5 +; GCN-NEXT: v_readlane_b32 s36, v40, 4 +; GCN-NEXT: v_readlane_b32 s35, v40, 3 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s12 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s33 +; GISEL-NEXT: s_mov_b32 
s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: ; implicit-def: $vgpr41 -; GISEL-NEXT: v_writelane_b32 v41, s30, 0 -; GISEL-NEXT: v_writelane_b32 v41, s31, 1 -; GISEL-NEXT: v_writelane_b32 v41, s34, 2 -; GISEL-NEXT: v_writelane_b32 v41, s35, 3 -; GISEL-NEXT: v_writelane_b32 v41, s36, 4 -; GISEL-NEXT: v_writelane_b32 v41, s37, 5 -; GISEL-NEXT: v_writelane_b32 v41, s38, 6 -; GISEL-NEXT: v_writelane_b32 v41, s39, 7 -; GISEL-NEXT: v_writelane_b32 v41, s40, 8 -; GISEL-NEXT: v_writelane_b32 v41, s41, 9 -; GISEL-NEXT: v_writelane_b32 v41, s42, 10 -; GISEL-NEXT: v_writelane_b32 v41, s43, 11 -; GISEL-NEXT: v_writelane_b32 v41, s44, 12 -; GISEL-NEXT: v_writelane_b32 v41, s45, 13 -; GISEL-NEXT: v_writelane_b32 v41, s46, 14 -; GISEL-NEXT: v_writelane_b32 v41, s47, 15 -; GISEL-NEXT: v_writelane_b32 v41, s48, 16 -; GISEL-NEXT: v_writelane_b32 v41, s49, 17 -; GISEL-NEXT: v_writelane_b32 v41, s50, 18 -; GISEL-NEXT: v_writelane_b32 v41, s51, 19 -; GISEL-NEXT: v_writelane_b32 v41, s52, 20 -; GISEL-NEXT: v_writelane_b32 v41, s53, 21 -; GISEL-NEXT: v_writelane_b32 v41, s54, 22 -; GISEL-NEXT: v_writelane_b32 v41, s55, 23 -; GISEL-NEXT: v_writelane_b32 v41, s56, 24 -; GISEL-NEXT: v_writelane_b32 v41, s57, 25 -; GISEL-NEXT: v_writelane_b32 v41, s58, 26 -; GISEL-NEXT: v_writelane_b32 v41, s59, 27 -; GISEL-NEXT: v_writelane_b32 v41, s60, 28 -; GISEL-NEXT: v_writelane_b32 v41, s61, 29 -; GISEL-NEXT: v_writelane_b32 v41, s62, 30 -; GISEL-NEXT: v_writelane_b32 v41, s63, 31 -; GISEL-NEXT: v_mov_b32_e32 v40, v0 +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: v_writelane_b32 v40, 
s30, 0 +; GISEL-NEXT: v_writelane_b32 v40, s31, 1 +; GISEL-NEXT: v_writelane_b32 v40, s34, 2 +; GISEL-NEXT: v_writelane_b32 v40, s35, 3 +; GISEL-NEXT: v_writelane_b32 v40, s36, 4 +; GISEL-NEXT: v_writelane_b32 v40, s37, 5 +; GISEL-NEXT: v_writelane_b32 v40, s38, 6 +; GISEL-NEXT: v_writelane_b32 v40, s39, 7 +; GISEL-NEXT: v_writelane_b32 v40, s40, 8 +; GISEL-NEXT: v_writelane_b32 v40, s41, 9 +; GISEL-NEXT: v_writelane_b32 v40, s42, 10 +; GISEL-NEXT: v_writelane_b32 v40, s43, 11 +; GISEL-NEXT: v_writelane_b32 v40, s44, 12 +; GISEL-NEXT: v_writelane_b32 v40, s45, 13 +; GISEL-NEXT: v_writelane_b32 v40, s46, 14 +; GISEL-NEXT: v_writelane_b32 v40, s47, 15 +; GISEL-NEXT: v_writelane_b32 v40, s48, 16 +; GISEL-NEXT: v_writelane_b32 v40, s49, 17 +; GISEL-NEXT: v_writelane_b32 v40, s50, 18 +; GISEL-NEXT: v_writelane_b32 v40, s51, 19 +; GISEL-NEXT: v_writelane_b32 v40, s52, 20 +; GISEL-NEXT: v_writelane_b32 v40, s53, 21 +; GISEL-NEXT: v_writelane_b32 v40, s54, 22 +; GISEL-NEXT: v_writelane_b32 v40, s55, 23 +; GISEL-NEXT: v_writelane_b32 v40, s56, 24 +; GISEL-NEXT: v_writelane_b32 v40, s57, 25 +; GISEL-NEXT: v_writelane_b32 v40, s58, 26 +; GISEL-NEXT: v_writelane_b32 v40, s59, 27 +; GISEL-NEXT: v_writelane_b32 v40, s60, 28 +; GISEL-NEXT: v_writelane_b32 v40, s61, 29 +; GISEL-NEXT: v_writelane_b32 v40, s62, 30 +; GISEL-NEXT: v_writelane_b32 v40, s63, 31 +; GISEL-NEXT: v_mov_b32_e32 v41, v0 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v1 ; GISEL-NEXT: v_readfirstlane_b32 s7, v2 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] ; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v40 +; GISEL-NEXT: v_mov_b32_e32 v0, v41 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr1 ; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB7_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_mov_b32_e32 
v0, v40 -; GISEL-NEXT: v_readlane_b32 s63, v41, 31 -; GISEL-NEXT: v_readlane_b32 s62, v41, 30 -; GISEL-NEXT: v_readlane_b32 s61, v41, 29 -; GISEL-NEXT: v_readlane_b32 s60, v41, 28 -; GISEL-NEXT: v_readlane_b32 s59, v41, 27 -; GISEL-NEXT: v_readlane_b32 s58, v41, 26 -; GISEL-NEXT: v_readlane_b32 s57, v41, 25 -; GISEL-NEXT: v_readlane_b32 s56, v41, 24 -; GISEL-NEXT: v_readlane_b32 s55, v41, 23 -; GISEL-NEXT: v_readlane_b32 s54, v41, 22 -; GISEL-NEXT: v_readlane_b32 s53, v41, 21 -; GISEL-NEXT: v_readlane_b32 s52, v41, 20 -; GISEL-NEXT: v_readlane_b32 s51, v41, 19 -; GISEL-NEXT: v_readlane_b32 s50, v41, 18 -; GISEL-NEXT: v_readlane_b32 s49, v41, 17 -; GISEL-NEXT: v_readlane_b32 s48, v41, 16 -; GISEL-NEXT: v_readlane_b32 s47, v41, 15 -; GISEL-NEXT: v_readlane_b32 s46, v41, 14 -; GISEL-NEXT: v_readlane_b32 s45, v41, 13 -; GISEL-NEXT: v_readlane_b32 s44, v41, 12 -; GISEL-NEXT: v_readlane_b32 s43, v41, 11 -; GISEL-NEXT: v_readlane_b32 s42, v41, 10 -; GISEL-NEXT: v_readlane_b32 s41, v41, 9 -; GISEL-NEXT: v_readlane_b32 s40, v41, 8 -; GISEL-NEXT: v_readlane_b32 s39, v41, 7 -; GISEL-NEXT: v_readlane_b32 s38, v41, 6 -; GISEL-NEXT: v_readlane_b32 s37, v41, 5 -; GISEL-NEXT: v_readlane_b32 s36, v41, 4 -; GISEL-NEXT: v_readlane_b32 s35, v41, 3 -; GISEL-NEXT: v_readlane_b32 s34, v41, 2 -; GISEL-NEXT: v_readlane_b32 s31, v41, 1 -; GISEL-NEXT: v_readlane_b32 s30, v41, 0 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: v_mov_b32_e32 v0, v41 +; GISEL-NEXT: v_readlane_b32 s63, v40, 31 +; GISEL-NEXT: v_readlane_b32 s62, v40, 30 +; GISEL-NEXT: v_readlane_b32 s61, v40, 29 +; GISEL-NEXT: v_readlane_b32 s60, v40, 28 +; GISEL-NEXT: v_readlane_b32 s59, v40, 27 +; GISEL-NEXT: v_readlane_b32 s58, v40, 26 +; GISEL-NEXT: v_readlane_b32 s57, v40, 25 +; GISEL-NEXT: v_readlane_b32 s56, v40, 24 +; GISEL-NEXT: v_readlane_b32 s55, v40, 23 +; GISEL-NEXT: v_readlane_b32 s54, v40, 22 +; GISEL-NEXT: v_readlane_b32 s53, v40, 21 +; GISEL-NEXT: v_readlane_b32 s52, 
v40, 20 +; GISEL-NEXT: v_readlane_b32 s51, v40, 19 +; GISEL-NEXT: v_readlane_b32 s50, v40, 18 +; GISEL-NEXT: v_readlane_b32 s49, v40, 17 +; GISEL-NEXT: v_readlane_b32 s48, v40, 16 +; GISEL-NEXT: v_readlane_b32 s47, v40, 15 +; GISEL-NEXT: v_readlane_b32 s46, v40, 14 +; GISEL-NEXT: v_readlane_b32 s45, v40, 13 +; GISEL-NEXT: v_readlane_b32 s44, v40, 12 +; GISEL-NEXT: v_readlane_b32 s43, v40, 11 +; GISEL-NEXT: v_readlane_b32 s42, v40, 10 +; GISEL-NEXT: v_readlane_b32 s41, v40, 9 +; GISEL-NEXT: v_readlane_b32 s40, v40, 8 +; GISEL-NEXT: v_readlane_b32 s39, v40, 7 +; GISEL-NEXT: v_readlane_b32 s38, v40, 6 +; GISEL-NEXT: v_readlane_b32 s37, v40, 5 +; GISEL-NEXT: v_readlane_b32 s36, v40, 4 +; GISEL-NEXT: v_readlane_b32 s35, v40, 3 +; GISEL-NEXT: v_readlane_b32 s34, v40, 2 +; GISEL-NEXT: v_readlane_b32 s31, v40, 1 +; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s12 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 %i) @@ -1543,13 +1531,12 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: 
v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1633,20 +1620,19 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s12 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s33 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1730,7 +1716,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s12 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call amdgpu_gfx i32 %fptr(i32 %i) @@ -1742,13 +1728,12 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: 
v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1829,20 +1814,19 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s12 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s33 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1923,7 +1907,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s12 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] tail call amdgpu_gfx void %fptr() diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index f956251e4fb54..f196192f723f9 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -13,30 +13,17 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; 
implicit-def: $vgpr3 -; CHECK-NEXT: v_writelane_b32 v3, s16, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s12, s33, 0x100200 -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: v_writelane_b32 v40, s16, 0 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 -; CHECK-NEXT: v_readlane_b32 s14, v3, 0 +; CHECK-NEXT: v_readlane_b32 s14, v40, 0 ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v0, s8, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v40, s8, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def vgpr10 ; CHECK-NEXT: ;;#ASMEND @@ -69,14 +56,9 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100100 ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_readlane_b32 s4, v0, 1 +; CHECK-NEXT: v_readlane_b32 s4, v40, 1 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_cmp_eq_u32 s4, 
s5 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 98641f302a6ab..8946846898f85 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -3037,17 +3037,17 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s38, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s40, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s43, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s39, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s40, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s41, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff ; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s31, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s34, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s39, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s43, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s44, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s45, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s46, s8, 0xffff @@ -3172,13 +3172,13 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GCN-HSA-NEXT: 
flat_store_dwordx4 v[12:13], v[20:23] ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] @@ -3206,7 +3206,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6181,129 +6181,129 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s33, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s34, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16 -; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-HSA-NEXT: s_and_b32 s35, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s35, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 -; 
GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x70 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x50 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6589,17 +6589,17 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s8, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 @@ -6613,12 +6613,12 @@ define amdgpu_kernel void 
@constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s51 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s49 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6647,10 +6647,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[60:61], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[58:59], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[54:55], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[54:55], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[52:53], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[30:31], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[28:29], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 @@ -6678,8 +6678,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s65 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s62 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s49 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s35 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s24 @@ -6923,123 +6923,123 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s15 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s13 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s15 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s13 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[82:83], s[14:15], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s11 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s3 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s52, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s0, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s11 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s3 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s2, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s0, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s82 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s83 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s9 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s9 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s81 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s78 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s79 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s76 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s77 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 
0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s74 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s75 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s14, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s73 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s12, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 +; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s8, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s6, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s4, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 +; 
GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index f5ca18d721fbc..36ac9d212b684 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -231,10 +231,10 @@ entry: ; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec ; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:44 ; 4-byte 
Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -251,7 +251,7 @@ entry: ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill @@ -264,18 +264,16 @@ entry: ; W64-O0: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] ; W64-O0: ; %bb.{{[0-9]+}}: ; %bb1 -; W64-O0: buffer_load_dword -; W64-O0: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill ; W64-O0-DAG: s_mov_b64 s[[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]], exec ; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] ; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] -; W64-O0: buffer_store_dword [[VSAVEEXEC]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill ; W64-O0: 
[[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -292,9 +290,7 @@ entry: ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_store_dword ; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload -; W64-O0: buffer_load_dword ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill @@ -302,9 +298,8 @@ entry: ; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] ; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload -; W64-O0: buffer_load_dword [[VSAVEEXEC1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload -; W64-O0: v_readlane_b32 
s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC1]], [[SAVEEXEC_IDX0]] -; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC1]], [[SAVEEXEC_IDX1]] +; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] +; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] ; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]] ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index d5bcf685f9f04..6a14b88eb630e 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -191,49 +191,48 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v42, s30, 0 -; GFX9-NEXT: v_writelane_b32 v42, s31, 1 -; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: v_writelane_b32 v44, s4, 0 -; GFX9-NEXT: v_writelane_b32 v42, s36, 3 +; GFX9-NEXT: v_writelane_b32 v40, s36, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v42, s37, 4 +; GFX9-NEXT: v_writelane_b32 v40, s37, 4 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 
4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 ; GFX9-NEXT: s_mov_b32 s34, s15 -; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v40 +; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v43 +; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 ; GFX9-NEXT: s_mov_b32 s15, s34 -; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: v_mov_b32_e32 v0, v41 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_add_u32_e32 v0, v40, v43 +; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 ; GFX9-NEXT: s_mov_b32 s15, s34 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s37, v42, 4 -; GFX9-NEXT: v_readlane_b32 s36, v42, 3 -; GFX9-NEXT: v_readlane_b32 s34, v42, 2 -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 -; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s37, v40, 4 +; GFX9-NEXT: v_readlane_b32 s36, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: 
v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v44, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index f11959c53dc19..50c27d1835c9f 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,19 +27,15 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v0, s30, 0 -; CHECK-NEXT: v_writelane_b32 v0, s31, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[12:13], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[12:13] +; CHECK-NEXT: v_writelane_b32 v1, s30, 0 +; CHECK-NEXT: v_writelane_b32 v1, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 @@ -48,21 +44,17 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { 
; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9] ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: s_or_saveexec_b64 s[12:13], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[12:13] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s31, v0, 1 -; CHECK-NEXT: v_readlane_b32 s30, v0, 0 +; CHECK-NEXT: v_readlane_b32 s31, v1, 1 +; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s14 +; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -99,22 +91,21 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v0, s33, 0 +; CHECK-NEXT: v_writelane_b32 v1, s33, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 -; CHECK-NEXT: v_readlane_b32 s33, v0, 0 +; CHECK-NEXT: 
v_readlane_b32 s33, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_xor_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[4:5] bb: call void asm sideeffect "; clobber csr v40", "~{v40}"() @@ -161,13 +152,12 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s12, s33 +; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: v_writelane_b32 v1, s30, 0 ; CHECK-NEXT: v_writelane_b32 v1, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -184,7 +174,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s12 +; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -196,19 +186,14 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s13, s33 +; CHECK-NEXT: s_mov_b32 s7, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte 
Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v0, s30, 0 -; CHECK-NEXT: v_writelane_b32 v0, s31, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[14:15], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[14:15] +; CHECK-NEXT: v_writelane_b32 v2, s30, 0 +; CHECK-NEXT: v_writelane_b32 v2, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, caller_save_vgpr_spill_fp_tail_call@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, caller_save_vgpr_spill_fp_tail_call@rel32@hi+12 @@ -217,18 +202,13 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9] ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: s_or_saveexec_b64 s[14:15], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[14:15] -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s31, v1, 1 -; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: v_readlane_b32 s31, v2, 1 +; CHECK-NEXT: v_readlane_b32 s30, v2, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s13 +; CHECK-NEXT: s_mov_b32 s33, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll 
b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 7df0d10277388..c71938eb81882 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -15,21 +15,14 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s34, 0 -; CHECK-NEXT: v_writelane_b32 v40, s35, 1 -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 +; CHECK-NEXT: v_writelane_b32 v41, s16, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v0, s30, 0 -; CHECK-NEXT: v_writelane_b32 v0, s31, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -43,20 +36,13 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: .Ltmp1: -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: .loc 0 32 1 ; 
lane-info.cpp:32:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s31, v0, 1 -; CHECK-NEXT: v_readlane_b32 s30, v0, 0 -; CHECK-NEXT: v_readlane_b32 s34, v40, 0 -; CHECK-NEXT: v_readlane_b32 s35, v40, 1 -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 -; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s4, v41, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index bf29873ba2800..54229988f2ee4 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -11,17 +11,10 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, i32 %in) #0 { ; GCN-LABEL: spill_sgprs_to_multiple_vgprs: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s92, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s93, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s94, -1 -; GCN-NEXT: s_mov_b32 s95, 0xe8f000 -; GCN-NEXT: s_add_u32 s92, s92, s3 -; GCN-NEXT: s_addc_u32 s93, s93, 0 ; GCN-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: v_writelane_b32 v0, s5, 1 ; GCN-NEXT: v_writelane_b32 v0, s6, 2 @@ -107,285 +100,264 @@ define 
amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: v_writelane_b32 v0, s9, 61 ; GCN-NEXT: v_writelane_b32 v0, s10, 62 ; GCN-NEXT: v_writelane_b32 v0, s11, 63 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: v_writelane_b32 v1, s5, 1 +; GCN-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-NEXT: v_writelane_b32 v1, s8, 4 +; GCN-NEXT: v_writelane_b32 v1, s9, 5 +; GCN-NEXT: v_writelane_b32 v1, s10, 6 +; GCN-NEXT: v_writelane_b32 v1, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 8 -; GCN-NEXT: v_writelane_b32 v0, s5, 9 -; GCN-NEXT: v_writelane_b32 v0, s6, 10 -; GCN-NEXT: v_writelane_b32 v0, s7, 11 -; GCN-NEXT: v_writelane_b32 v0, s8, 12 -; GCN-NEXT: v_writelane_b32 v0, s9, 13 -; GCN-NEXT: v_writelane_b32 v0, s10, 14 -; GCN-NEXT: v_writelane_b32 v0, s11, 15 +; GCN-NEXT: v_writelane_b32 v1, s4, 8 +; GCN-NEXT: v_writelane_b32 v1, s5, 9 +; GCN-NEXT: v_writelane_b32 v1, s6, 10 +; GCN-NEXT: v_writelane_b32 v1, s7, 11 +; GCN-NEXT: v_writelane_b32 v1, s8, 12 +; GCN-NEXT: v_writelane_b32 v1, s9, 13 +; GCN-NEXT: v_writelane_b32 v1, s10, 14 +; GCN-NEXT: v_writelane_b32 v1, s11, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 16 -; GCN-NEXT: v_writelane_b32 v0, s5, 17 -; GCN-NEXT: v_writelane_b32 v0, s6, 18 
-; GCN-NEXT: v_writelane_b32 v0, s7, 19 -; GCN-NEXT: v_writelane_b32 v0, s8, 20 -; GCN-NEXT: v_writelane_b32 v0, s9, 21 -; GCN-NEXT: v_writelane_b32 v0, s10, 22 -; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v1, s4, 16 +; GCN-NEXT: v_writelane_b32 v1, s5, 17 +; GCN-NEXT: v_writelane_b32 v1, s6, 18 +; GCN-NEXT: v_writelane_b32 v1, s7, 19 +; GCN-NEXT: v_writelane_b32 v1, s8, 20 +; GCN-NEXT: v_writelane_b32 v1, s9, 21 +; GCN-NEXT: v_writelane_b32 v1, s10, 22 +; GCN-NEXT: v_writelane_b32 v1, s11, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 24 -; GCN-NEXT: v_writelane_b32 v0, s5, 25 -; GCN-NEXT: v_writelane_b32 v0, s6, 26 -; GCN-NEXT: v_writelane_b32 v0, s7, 27 -; GCN-NEXT: v_writelane_b32 v0, s8, 28 -; GCN-NEXT: v_writelane_b32 v0, s9, 29 -; GCN-NEXT: v_writelane_b32 v0, s10, 30 -; GCN-NEXT: v_writelane_b32 v0, s11, 31 +; GCN-NEXT: v_writelane_b32 v1, s4, 24 +; GCN-NEXT: v_writelane_b32 v1, s5, 25 +; GCN-NEXT: v_writelane_b32 v1, s6, 26 +; GCN-NEXT: v_writelane_b32 v1, s7, 27 +; GCN-NEXT: v_writelane_b32 v1, s8, 28 +; GCN-NEXT: v_writelane_b32 v1, s9, 29 +; GCN-NEXT: v_writelane_b32 v1, s10, 30 +; GCN-NEXT: v_writelane_b32 v1, s11, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 32 -; GCN-NEXT: v_writelane_b32 v0, s5, 33 -; GCN-NEXT: v_writelane_b32 v0, s6, 34 -; GCN-NEXT: v_writelane_b32 v0, s7, 35 -; GCN-NEXT: v_writelane_b32 v0, s8, 36 -; GCN-NEXT: v_writelane_b32 v0, s9, 37 -; GCN-NEXT: v_writelane_b32 v0, s10, 38 -; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v1, s4, 32 +; GCN-NEXT: v_writelane_b32 v1, s5, 33 +; GCN-NEXT: v_writelane_b32 v1, s6, 34 +; GCN-NEXT: v_writelane_b32 v1, s7, 35 +; GCN-NEXT: v_writelane_b32 v1, s8, 36 +; GCN-NEXT: v_writelane_b32 v1, s9, 37 +; GCN-NEXT: v_writelane_b32 v1, s10, 38 +; GCN-NEXT: v_writelane_b32 v1, s11, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def 
s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 40 -; GCN-NEXT: v_writelane_b32 v0, s5, 41 -; GCN-NEXT: v_writelane_b32 v0, s6, 42 -; GCN-NEXT: v_writelane_b32 v0, s7, 43 -; GCN-NEXT: v_writelane_b32 v0, s8, 44 -; GCN-NEXT: v_writelane_b32 v0, s9, 45 -; GCN-NEXT: v_writelane_b32 v0, s10, 46 -; GCN-NEXT: v_writelane_b32 v0, s11, 47 +; GCN-NEXT: v_writelane_b32 v1, s4, 40 +; GCN-NEXT: v_writelane_b32 v1, s5, 41 +; GCN-NEXT: v_writelane_b32 v1, s6, 42 +; GCN-NEXT: v_writelane_b32 v1, s7, 43 +; GCN-NEXT: v_writelane_b32 v1, s8, 44 +; GCN-NEXT: v_writelane_b32 v1, s9, 45 +; GCN-NEXT: v_writelane_b32 v1, s10, 46 +; GCN-NEXT: v_writelane_b32 v1, s11, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 48 -; GCN-NEXT: v_writelane_b32 v0, s5, 49 -; GCN-NEXT: v_writelane_b32 v0, s6, 50 -; GCN-NEXT: v_writelane_b32 v0, s7, 51 -; GCN-NEXT: v_writelane_b32 v0, s8, 52 -; GCN-NEXT: v_writelane_b32 v0, s9, 53 -; GCN-NEXT: v_writelane_b32 v0, s10, 54 -; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v1, s4, 48 +; GCN-NEXT: v_writelane_b32 v1, s5, 49 +; GCN-NEXT: v_writelane_b32 v1, s6, 50 +; GCN-NEXT: v_writelane_b32 v1, s7, 51 +; GCN-NEXT: v_writelane_b32 v1, s8, 52 +; GCN-NEXT: v_writelane_b32 v1, s9, 53 +; GCN-NEXT: v_writelane_b32 v1, s10, 54 +; GCN-NEXT: v_writelane_b32 v1, s11, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 56 -; GCN-NEXT: v_writelane_b32 v0, s5, 57 -; GCN-NEXT: v_writelane_b32 v0, s6, 58 -; GCN-NEXT: v_writelane_b32 v0, s7, 59 -; GCN-NEXT: v_writelane_b32 v0, s8, 60 -; GCN-NEXT: v_writelane_b32 v0, s9, 61 -; GCN-NEXT: v_writelane_b32 v0, s10, 62 -; GCN-NEXT: v_writelane_b32 v0, s11, 63 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_writelane_b32 v1, s4, 56 +; 
GCN-NEXT: v_writelane_b32 v1, s5, 57 +; GCN-NEXT: v_writelane_b32 v1, s6, 58 +; GCN-NEXT: v_writelane_b32 v1, s7, 59 +; GCN-NEXT: v_writelane_b32 v1, s8, 60 +; GCN-NEXT: v_writelane_b32 v1, s9, 61 +; GCN-NEXT: v_writelane_b32 v1, s10, 62 +; GCN-NEXT: v_writelane_b32 v1, s11, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_writelane_b32 v2, s4, 0 +; GCN-NEXT: v_writelane_b32 v2, s5, 1 +; GCN-NEXT: v_writelane_b32 v2, s6, 2 +; GCN-NEXT: v_writelane_b32 v2, s7, 3 +; GCN-NEXT: v_writelane_b32 v2, s8, 4 +; GCN-NEXT: v_writelane_b32 v2, s9, 5 +; GCN-NEXT: v_writelane_b32 v2, s10, 6 +; GCN-NEXT: v_writelane_b32 v2, s11, 7 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s8, v2, 56 -; GCN-NEXT: v_readlane_b32 s9, v2, 57 -; GCN-NEXT: v_readlane_b32 s10, v2, 58 -; GCN-NEXT: 
v_readlane_b32 s11, v2, 59 -; GCN-NEXT: v_readlane_b32 s12, v2, 60 -; GCN-NEXT: v_readlane_b32 s13, v2, 61 -; GCN-NEXT: v_readlane_b32 s14, v2, 62 -; GCN-NEXT: v_readlane_b32 s15, v2, 63 -; GCN-NEXT: v_readlane_b32 s16, v2, 48 -; GCN-NEXT: v_readlane_b32 s17, v2, 49 -; GCN-NEXT: v_readlane_b32 s18, v2, 50 -; GCN-NEXT: v_readlane_b32 s19, v2, 51 -; GCN-NEXT: v_readlane_b32 s20, v2, 52 -; GCN-NEXT: v_readlane_b32 s21, v2, 53 -; GCN-NEXT: v_readlane_b32 s22, v2, 54 -; GCN-NEXT: v_readlane_b32 s23, v2, 55 -; GCN-NEXT: v_readlane_b32 s24, v2, 40 -; GCN-NEXT: v_readlane_b32 s25, v2, 41 -; GCN-NEXT: v_readlane_b32 s26, v2, 42 -; GCN-NEXT: v_readlane_b32 s27, v2, 43 -; GCN-NEXT: v_readlane_b32 s28, v2, 44 -; GCN-NEXT: v_readlane_b32 s29, v2, 45 -; GCN-NEXT: v_readlane_b32 s30, v2, 46 -; GCN-NEXT: v_readlane_b32 s31, v2, 47 -; GCN-NEXT: v_readlane_b32 s36, v2, 32 -; GCN-NEXT: v_readlane_b32 s37, v2, 33 -; GCN-NEXT: v_readlane_b32 s38, v2, 34 -; GCN-NEXT: v_readlane_b32 s39, v2, 35 -; GCN-NEXT: v_readlane_b32 s40, v2, 36 -; GCN-NEXT: v_readlane_b32 s41, v2, 37 -; GCN-NEXT: v_readlane_b32 s42, v2, 38 -; GCN-NEXT: v_readlane_b32 s43, v2, 39 -; GCN-NEXT: v_readlane_b32 s44, v2, 24 -; GCN-NEXT: v_readlane_b32 s45, v2, 25 -; GCN-NEXT: v_readlane_b32 s46, v2, 26 -; GCN-NEXT: v_readlane_b32 s47, v2, 27 -; GCN-NEXT: v_readlane_b32 s48, v2, 28 -; GCN-NEXT: v_readlane_b32 s49, v2, 29 -; GCN-NEXT: v_readlane_b32 s50, v2, 30 -; GCN-NEXT: v_readlane_b32 s51, v2, 31 -; GCN-NEXT: v_readlane_b32 s52, v2, 16 -; GCN-NEXT: v_readlane_b32 s53, v2, 17 -; GCN-NEXT: v_readlane_b32 s54, v2, 18 -; GCN-NEXT: v_readlane_b32 s55, v2, 19 -; GCN-NEXT: v_readlane_b32 s56, v2, 20 -; GCN-NEXT: v_readlane_b32 s57, v2, 21 -; GCN-NEXT: v_readlane_b32 s58, v2, 22 -; GCN-NEXT: v_readlane_b32 s59, v2, 23 -; GCN-NEXT: v_readlane_b32 s60, v2, 8 -; GCN-NEXT: v_readlane_b32 s61, v2, 9 -; GCN-NEXT: v_readlane_b32 s62, v2, 10 -; GCN-NEXT: v_readlane_b32 s63, v2, 11 -; GCN-NEXT: v_readlane_b32 s64, v2, 12 -; GCN-NEXT: 
v_readlane_b32 s65, v2, 13 -; GCN-NEXT: v_readlane_b32 s66, v2, 14 -; GCN-NEXT: v_readlane_b32 s67, v2, 15 -; GCN-NEXT: v_readlane_b32 s68, v2, 0 -; GCN-NEXT: v_readlane_b32 s69, v2, 1 -; GCN-NEXT: v_readlane_b32 s70, v2, 2 -; GCN-NEXT: v_readlane_b32 s71, v2, 3 -; GCN-NEXT: v_readlane_b32 s72, v2, 4 -; GCN-NEXT: v_readlane_b32 s73, v2, 5 -; GCN-NEXT: v_readlane_b32 s74, v2, 6 -; GCN-NEXT: v_readlane_b32 s75, v2, 7 -; GCN-NEXT: v_readlane_b32 s76, v1, 56 -; GCN-NEXT: v_readlane_b32 s77, v1, 57 -; GCN-NEXT: v_readlane_b32 s78, v1, 58 -; GCN-NEXT: v_readlane_b32 s79, v1, 59 -; GCN-NEXT: v_readlane_b32 s80, v1, 60 -; GCN-NEXT: v_readlane_b32 s81, v1, 61 -; GCN-NEXT: v_readlane_b32 s82, v1, 62 -; GCN-NEXT: v_readlane_b32 s83, v1, 63 -; GCN-NEXT: v_readlane_b32 s84, v1, 48 -; GCN-NEXT: v_readlane_b32 s85, v1, 49 -; GCN-NEXT: v_readlane_b32 s86, v1, 50 -; GCN-NEXT: v_readlane_b32 s87, v1, 51 -; GCN-NEXT: v_readlane_b32 s88, v1, 52 -; GCN-NEXT: v_readlane_b32 s89, v1, 53 -; GCN-NEXT: v_readlane_b32 s90, v1, 54 -; GCN-NEXT: v_readlane_b32 s91, v1, 55 -; GCN-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-NEXT: v_readlane_b32 s2, v1, 2 -; GCN-NEXT: v_readlane_b32 s3, v1, 3 -; GCN-NEXT: v_readlane_b32 s4, v1, 4 -; GCN-NEXT: v_readlane_b32 s5, v1, 5 -; GCN-NEXT: v_readlane_b32 s6, v1, 6 -; GCN-NEXT: v_readlane_b32 s7, v1, 7 +; GCN-NEXT: v_readlane_b32 s8, v1, 56 +; GCN-NEXT: v_readlane_b32 s9, v1, 57 +; GCN-NEXT: v_readlane_b32 s10, v1, 58 +; GCN-NEXT: v_readlane_b32 s11, v1, 59 +; GCN-NEXT: v_readlane_b32 s12, v1, 60 +; GCN-NEXT: v_readlane_b32 s13, v1, 61 +; GCN-NEXT: v_readlane_b32 s14, v1, 62 +; GCN-NEXT: v_readlane_b32 s15, v1, 63 +; GCN-NEXT: v_readlane_b32 s16, v1, 48 +; GCN-NEXT: v_readlane_b32 s17, v1, 49 +; GCN-NEXT: v_readlane_b32 s18, v1, 50 +; GCN-NEXT: v_readlane_b32 s19, v1, 51 +; GCN-NEXT: v_readlane_b32 s20, v1, 52 +; GCN-NEXT: v_readlane_b32 s21, v1, 53 +; GCN-NEXT: v_readlane_b32 s22, v1, 54 +; GCN-NEXT: v_readlane_b32 s23, v1, 
55 +; GCN-NEXT: v_readlane_b32 s24, v1, 40 +; GCN-NEXT: v_readlane_b32 s25, v1, 41 +; GCN-NEXT: v_readlane_b32 s26, v1, 42 +; GCN-NEXT: v_readlane_b32 s27, v1, 43 +; GCN-NEXT: v_readlane_b32 s28, v1, 44 +; GCN-NEXT: v_readlane_b32 s29, v1, 45 +; GCN-NEXT: v_readlane_b32 s30, v1, 46 +; GCN-NEXT: v_readlane_b32 s31, v1, 47 +; GCN-NEXT: v_readlane_b32 s36, v1, 32 +; GCN-NEXT: v_readlane_b32 s37, v1, 33 +; GCN-NEXT: v_readlane_b32 s38, v1, 34 +; GCN-NEXT: v_readlane_b32 s39, v1, 35 +; GCN-NEXT: v_readlane_b32 s40, v1, 36 +; GCN-NEXT: v_readlane_b32 s41, v1, 37 +; GCN-NEXT: v_readlane_b32 s42, v1, 38 +; GCN-NEXT: v_readlane_b32 s43, v1, 39 +; GCN-NEXT: v_readlane_b32 s44, v1, 24 +; GCN-NEXT: v_readlane_b32 s45, v1, 25 +; GCN-NEXT: v_readlane_b32 s46, v1, 26 +; GCN-NEXT: v_readlane_b32 s47, v1, 27 +; GCN-NEXT: v_readlane_b32 s48, v1, 28 +; GCN-NEXT: v_readlane_b32 s49, v1, 29 +; GCN-NEXT: v_readlane_b32 s50, v1, 30 +; GCN-NEXT: v_readlane_b32 s51, v1, 31 +; GCN-NEXT: v_readlane_b32 s52, v1, 16 +; GCN-NEXT: v_readlane_b32 s53, v1, 17 +; GCN-NEXT: v_readlane_b32 s54, v1, 18 +; GCN-NEXT: v_readlane_b32 s55, v1, 19 +; GCN-NEXT: v_readlane_b32 s56, v1, 20 +; GCN-NEXT: v_readlane_b32 s57, v1, 21 +; GCN-NEXT: v_readlane_b32 s58, v1, 22 +; GCN-NEXT: v_readlane_b32 s59, v1, 23 +; GCN-NEXT: v_readlane_b32 s60, v1, 8 +; GCN-NEXT: v_readlane_b32 s61, v1, 9 +; GCN-NEXT: v_readlane_b32 s62, v1, 10 +; GCN-NEXT: v_readlane_b32 s63, v1, 11 +; GCN-NEXT: v_readlane_b32 s64, v1, 12 +; GCN-NEXT: v_readlane_b32 s65, v1, 13 +; GCN-NEXT: v_readlane_b32 s66, v1, 14 +; GCN-NEXT: v_readlane_b32 s67, v1, 15 +; GCN-NEXT: v_readlane_b32 s68, v1, 0 +; GCN-NEXT: v_readlane_b32 s69, v1, 1 +; GCN-NEXT: v_readlane_b32 s70, v1, 2 +; GCN-NEXT: v_readlane_b32 s71, v1, 3 +; GCN-NEXT: v_readlane_b32 s72, v1, 4 +; GCN-NEXT: v_readlane_b32 s73, v1, 5 +; GCN-NEXT: v_readlane_b32 s74, v1, 6 +; GCN-NEXT: v_readlane_b32 s75, v1, 7 +; GCN-NEXT: v_readlane_b32 s76, v0, 56 +; GCN-NEXT: v_readlane_b32 s77, v0, 57 +; 
GCN-NEXT: v_readlane_b32 s78, v0, 58 +; GCN-NEXT: v_readlane_b32 s79, v0, 59 +; GCN-NEXT: v_readlane_b32 s80, v0, 60 +; GCN-NEXT: v_readlane_b32 s81, v0, 61 +; GCN-NEXT: v_readlane_b32 s82, v0, 62 +; GCN-NEXT: v_readlane_b32 s83, v0, 63 +; GCN-NEXT: v_readlane_b32 s84, v0, 48 +; GCN-NEXT: v_readlane_b32 s85, v0, 49 +; GCN-NEXT: v_readlane_b32 s86, v0, 50 +; GCN-NEXT: v_readlane_b32 s87, v0, 51 +; GCN-NEXT: v_readlane_b32 s88, v0, 52 +; GCN-NEXT: v_readlane_b32 s89, v0, 53 +; GCN-NEXT: v_readlane_b32 s90, v0, 54 +; GCN-NEXT: v_readlane_b32 s91, v0, 55 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s2, v0, 2 +; GCN-NEXT: v_readlane_b32 s3, v0, 3 +; GCN-NEXT: v_readlane_b32 s4, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v0, 5 +; GCN-NEXT: v_readlane_b32 s6, v0, 6 +; GCN-NEXT: v_readlane_b32 s7, v0, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 8 -; GCN-NEXT: v_readlane_b32 s1, v1, 9 -; GCN-NEXT: v_readlane_b32 s2, v1, 10 -; GCN-NEXT: v_readlane_b32 s3, v1, 11 -; GCN-NEXT: v_readlane_b32 s4, v1, 12 -; GCN-NEXT: v_readlane_b32 s5, v1, 13 -; GCN-NEXT: v_readlane_b32 s6, v1, 14 -; GCN-NEXT: v_readlane_b32 s7, v1, 15 +; GCN-NEXT: v_readlane_b32 s0, v0, 8 +; GCN-NEXT: v_readlane_b32 s1, v0, 9 +; GCN-NEXT: v_readlane_b32 s2, v0, 10 +; GCN-NEXT: v_readlane_b32 s3, v0, 11 +; GCN-NEXT: v_readlane_b32 s4, v0, 12 +; GCN-NEXT: v_readlane_b32 s5, v0, 13 +; GCN-NEXT: v_readlane_b32 s6, v0, 14 +; GCN-NEXT: v_readlane_b32 s7, v0, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 16 -; GCN-NEXT: v_readlane_b32 s1, v1, 17 -; GCN-NEXT: v_readlane_b32 s2, v1, 18 -; GCN-NEXT: v_readlane_b32 s3, v1, 19 -; GCN-NEXT: v_readlane_b32 s4, v1, 20 -; GCN-NEXT: v_readlane_b32 s5, v1, 21 -; GCN-NEXT: v_readlane_b32 s6, v1, 22 -; GCN-NEXT: v_readlane_b32 s7, v1, 23 +; GCN-NEXT: v_readlane_b32 s0, v0, 16 +; GCN-NEXT: 
v_readlane_b32 s1, v0, 17 +; GCN-NEXT: v_readlane_b32 s2, v0, 18 +; GCN-NEXT: v_readlane_b32 s3, v0, 19 +; GCN-NEXT: v_readlane_b32 s4, v0, 20 +; GCN-NEXT: v_readlane_b32 s5, v0, 21 +; GCN-NEXT: v_readlane_b32 s6, v0, 22 +; GCN-NEXT: v_readlane_b32 s7, v0, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 24 -; GCN-NEXT: v_readlane_b32 s1, v1, 25 -; GCN-NEXT: v_readlane_b32 s2, v1, 26 -; GCN-NEXT: v_readlane_b32 s3, v1, 27 -; GCN-NEXT: v_readlane_b32 s4, v1, 28 -; GCN-NEXT: v_readlane_b32 s5, v1, 29 -; GCN-NEXT: v_readlane_b32 s6, v1, 30 -; GCN-NEXT: v_readlane_b32 s7, v1, 31 +; GCN-NEXT: v_readlane_b32 s0, v0, 24 +; GCN-NEXT: v_readlane_b32 s1, v0, 25 +; GCN-NEXT: v_readlane_b32 s2, v0, 26 +; GCN-NEXT: v_readlane_b32 s3, v0, 27 +; GCN-NEXT: v_readlane_b32 s4, v0, 28 +; GCN-NEXT: v_readlane_b32 s5, v0, 29 +; GCN-NEXT: v_readlane_b32 s6, v0, 30 +; GCN-NEXT: v_readlane_b32 s7, v0, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 32 -; GCN-NEXT: v_readlane_b32 s1, v1, 33 -; GCN-NEXT: v_readlane_b32 s2, v1, 34 -; GCN-NEXT: v_readlane_b32 s3, v1, 35 -; GCN-NEXT: v_readlane_b32 s4, v1, 36 -; GCN-NEXT: v_readlane_b32 s5, v1, 37 -; GCN-NEXT: v_readlane_b32 s6, v1, 38 -; GCN-NEXT: v_readlane_b32 s7, v1, 39 +; GCN-NEXT: v_readlane_b32 s0, v0, 32 +; GCN-NEXT: v_readlane_b32 s1, v0, 33 +; GCN-NEXT: v_readlane_b32 s2, v0, 34 +; GCN-NEXT: v_readlane_b32 s3, v0, 35 +; GCN-NEXT: v_readlane_b32 s4, v0, 36 +; GCN-NEXT: v_readlane_b32 s5, v0, 37 +; GCN-NEXT: v_readlane_b32 s6, v0, 38 +; GCN-NEXT: v_readlane_b32 s7, v0, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 40 -; GCN-NEXT: v_readlane_b32 s1, v1, 41 -; GCN-NEXT: v_readlane_b32 s2, v1, 42 -; GCN-NEXT: v_readlane_b32 s3, v1, 43 -; GCN-NEXT: v_readlane_b32 s4, v1, 44 -; GCN-NEXT: v_readlane_b32 s5, v1, 45 -; GCN-NEXT: v_readlane_b32 s6, v1, 46 -; 
GCN-NEXT: v_readlane_b32 s7, v1, 47 +; GCN-NEXT: v_readlane_b32 s0, v0, 40 +; GCN-NEXT: v_readlane_b32 s1, v0, 41 +; GCN-NEXT: v_readlane_b32 s2, v0, 42 +; GCN-NEXT: v_readlane_b32 s3, v0, 43 +; GCN-NEXT: v_readlane_b32 s4, v0, 44 +; GCN-NEXT: v_readlane_b32 s5, v0, 45 +; GCN-NEXT: v_readlane_b32 s6, v0, 46 +; GCN-NEXT: v_readlane_b32 s7, v0, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_readlane_b32 s2, v0, 2 -; GCN-NEXT: v_readlane_b32 s3, v0, 3 -; GCN-NEXT: v_readlane_b32 s4, v0, 4 -; GCN-NEXT: v_readlane_b32 s5, v0, 5 -; GCN-NEXT: v_readlane_b32 s6, v0, 6 -; GCN-NEXT: v_readlane_b32 s7, v0, 7 +; GCN-NEXT: v_readlane_b32 s0, v2, 0 +; GCN-NEXT: v_readlane_b32 s1, v2, 1 +; GCN-NEXT: v_readlane_b32 s2, v2, 2 +; GCN-NEXT: v_readlane_b32 s3, v2, 3 +; GCN-NEXT: v_readlane_b32 s4, v2, 4 +; GCN-NEXT: v_readlane_b32 s5, v2, 5 +; GCN-NEXT: v_readlane_b32 s6, v2, 6 +; GCN-NEXT: v_readlane_b32 s7, v2, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[84:91] ; GCN-NEXT: ;;#ASMEND @@ -470,17 +442,10 @@ ret: define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %in) #1 { ; GCN-LABEL: split_sgpr_spill_2_vgprs: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s54, -1 -; GCN-NEXT: s_mov_b32 s55, 0xe8f000 -; GCN-NEXT: s_add_u32 s52, s52, s3 -; GCN-NEXT: s_addc_u32 s53, s53, 0 ; GCN-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: v_writelane_b32 v0, s5, 1 ; GCN-NEXT: v_writelane_b32 v0, s6, 2 @@ -554,41 +519,27 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_writelane_b32 v0, s17, 61 ; GCN-NEXT: v_writelane_b32 v0, s18, 62 ; GCN-NEXT: v_writelane_b32 v0, s19, 63 -; 
GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: v_writelane_b32 v1, s5, 1 +; GCN-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-NEXT: v_writelane_b32 v1, s8, 4 +; GCN-NEXT: v_writelane_b32 v1, s9, 5 +; GCN-NEXT: v_writelane_b32 v1, s10, 6 +; GCN-NEXT: v_writelane_b32 v1, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s2, 8 -; GCN-NEXT: v_writelane_b32 v0, s3, 9 -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: v_writelane_b32 v1, s2, 8 +; GCN-NEXT: v_writelane_b32 v1, s3, 9 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s16, v1, 8 ; GCN-NEXT: v_readlane_b32 s17, v1, 9 ; GCN-NEXT: v_readlane_b32 s20, v1, 0 @@ -734,176 +685,176 @@ define amdgpu_kernel void 
@no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 -; GCN-NEXT: v_writelane_b32 v0, s12, 8 -; GCN-NEXT: v_writelane_b32 v0, s13, 9 -; GCN-NEXT: v_writelane_b32 v0, s14, 10 -; GCN-NEXT: v_writelane_b32 v0, s15, 11 -; GCN-NEXT: v_writelane_b32 v0, s16, 12 -; GCN-NEXT: v_writelane_b32 v0, s17, 13 -; GCN-NEXT: v_writelane_b32 v0, s18, 14 -; GCN-NEXT: v_writelane_b32 v0, s19, 15 +; GCN-NEXT: v_writelane_b32 v31, s4, 0 +; GCN-NEXT: v_writelane_b32 v31, s5, 1 +; GCN-NEXT: v_writelane_b32 v31, s6, 2 +; GCN-NEXT: v_writelane_b32 v31, s7, 3 +; GCN-NEXT: v_writelane_b32 v31, s8, 4 +; GCN-NEXT: v_writelane_b32 v31, s9, 5 +; GCN-NEXT: v_writelane_b32 v31, s10, 6 +; GCN-NEXT: v_writelane_b32 v31, s11, 7 +; GCN-NEXT: v_writelane_b32 v31, s12, 8 +; GCN-NEXT: v_writelane_b32 v31, s13, 9 +; GCN-NEXT: v_writelane_b32 v31, s14, 10 +; GCN-NEXT: v_writelane_b32 v31, s15, 11 +; GCN-NEXT: v_writelane_b32 v31, s16, 12 +; GCN-NEXT: v_writelane_b32 v31, s17, 13 +; GCN-NEXT: v_writelane_b32 v31, s18, 14 +; GCN-NEXT: v_writelane_b32 v31, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 16 -; GCN-NEXT: v_writelane_b32 v0, s5, 17 -; GCN-NEXT: v_writelane_b32 v0, s6, 18 -; GCN-NEXT: v_writelane_b32 v0, s7, 19 -; GCN-NEXT: v_writelane_b32 v0, s8, 20 -; GCN-NEXT: v_writelane_b32 v0, s9, 21 -; GCN-NEXT: v_writelane_b32 v0, s10, 22 -; GCN-NEXT: v_writelane_b32 v0, s11, 23 -; GCN-NEXT: v_writelane_b32 v0, s12, 24 -; GCN-NEXT: v_writelane_b32 v0, s13, 25 -; GCN-NEXT: v_writelane_b32 v0, s14, 26 -; GCN-NEXT: 
v_writelane_b32 v0, s15, 27 -; GCN-NEXT: v_writelane_b32 v0, s16, 28 -; GCN-NEXT: v_writelane_b32 v0, s17, 29 -; GCN-NEXT: v_writelane_b32 v0, s18, 30 -; GCN-NEXT: v_writelane_b32 v0, s19, 31 +; GCN-NEXT: v_writelane_b32 v31, s4, 16 +; GCN-NEXT: v_writelane_b32 v31, s5, 17 +; GCN-NEXT: v_writelane_b32 v31, s6, 18 +; GCN-NEXT: v_writelane_b32 v31, s7, 19 +; GCN-NEXT: v_writelane_b32 v31, s8, 20 +; GCN-NEXT: v_writelane_b32 v31, s9, 21 +; GCN-NEXT: v_writelane_b32 v31, s10, 22 +; GCN-NEXT: v_writelane_b32 v31, s11, 23 +; GCN-NEXT: v_writelane_b32 v31, s12, 24 +; GCN-NEXT: v_writelane_b32 v31, s13, 25 +; GCN-NEXT: v_writelane_b32 v31, s14, 26 +; GCN-NEXT: v_writelane_b32 v31, s15, 27 +; GCN-NEXT: v_writelane_b32 v31, s16, 28 +; GCN-NEXT: v_writelane_b32 v31, s17, 29 +; GCN-NEXT: v_writelane_b32 v31, s18, 30 +; GCN-NEXT: v_writelane_b32 v31, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 32 -; GCN-NEXT: v_writelane_b32 v0, s5, 33 -; GCN-NEXT: v_writelane_b32 v0, s6, 34 -; GCN-NEXT: v_writelane_b32 v0, s7, 35 -; GCN-NEXT: v_writelane_b32 v0, s8, 36 -; GCN-NEXT: v_writelane_b32 v0, s9, 37 -; GCN-NEXT: v_writelane_b32 v0, s10, 38 -; GCN-NEXT: v_writelane_b32 v0, s11, 39 -; GCN-NEXT: v_writelane_b32 v0, s12, 40 -; GCN-NEXT: v_writelane_b32 v0, s13, 41 -; GCN-NEXT: v_writelane_b32 v0, s14, 42 -; GCN-NEXT: v_writelane_b32 v0, s15, 43 -; GCN-NEXT: v_writelane_b32 v0, s16, 44 -; GCN-NEXT: v_writelane_b32 v0, s17, 45 -; GCN-NEXT: v_writelane_b32 v0, s18, 46 -; GCN-NEXT: v_writelane_b32 v0, s19, 47 +; GCN-NEXT: v_writelane_b32 v31, s4, 32 +; GCN-NEXT: v_writelane_b32 v31, s5, 33 +; GCN-NEXT: v_writelane_b32 v31, s6, 34 +; GCN-NEXT: v_writelane_b32 v31, s7, 35 +; GCN-NEXT: v_writelane_b32 v31, s8, 36 +; GCN-NEXT: v_writelane_b32 v31, s9, 37 +; GCN-NEXT: v_writelane_b32 v31, s10, 38 +; GCN-NEXT: v_writelane_b32 v31, s11, 39 +; GCN-NEXT: v_writelane_b32 v31, s12, 40 +; GCN-NEXT: v_writelane_b32 v31, s13, 41 +; 
GCN-NEXT: v_writelane_b32 v31, s14, 42 +; GCN-NEXT: v_writelane_b32 v31, s15, 43 +; GCN-NEXT: v_writelane_b32 v31, s16, 44 +; GCN-NEXT: v_writelane_b32 v31, s17, 45 +; GCN-NEXT: v_writelane_b32 v31, s18, 46 +; GCN-NEXT: v_writelane_b32 v31, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 48 -; GCN-NEXT: v_writelane_b32 v0, s5, 49 -; GCN-NEXT: v_writelane_b32 v0, s6, 50 -; GCN-NEXT: v_writelane_b32 v0, s7, 51 -; GCN-NEXT: v_writelane_b32 v0, s8, 52 -; GCN-NEXT: v_writelane_b32 v0, s9, 53 -; GCN-NEXT: v_writelane_b32 v0, s10, 54 -; GCN-NEXT: v_writelane_b32 v0, s11, 55 -; GCN-NEXT: v_writelane_b32 v0, s12, 56 -; GCN-NEXT: v_writelane_b32 v0, s13, 57 -; GCN-NEXT: v_writelane_b32 v0, s14, 58 -; GCN-NEXT: v_writelane_b32 v0, s15, 59 -; GCN-NEXT: v_writelane_b32 v0, s16, 60 -; GCN-NEXT: v_writelane_b32 v0, s17, 61 -; GCN-NEXT: v_writelane_b32 v0, s18, 62 -; GCN-NEXT: v_writelane_b32 v0, s19, 63 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_writelane_b32 v31, s4, 48 +; GCN-NEXT: v_writelane_b32 v31, s5, 49 +; GCN-NEXT: v_writelane_b32 v31, s6, 50 +; GCN-NEXT: v_writelane_b32 v31, s7, 51 +; GCN-NEXT: v_writelane_b32 v31, s8, 52 +; GCN-NEXT: v_writelane_b32 v31, s9, 53 +; GCN-NEXT: v_writelane_b32 v31, s10, 54 +; GCN-NEXT: v_writelane_b32 v31, s11, 55 +; GCN-NEXT: v_writelane_b32 v31, s12, 56 +; GCN-NEXT: v_writelane_b32 v31, s13, 57 +; GCN-NEXT: v_writelane_b32 v31, s14, 58 +; GCN-NEXT: v_writelane_b32 v31, s15, 59 +; GCN-NEXT: v_writelane_b32 v31, s16, 60 +; GCN-NEXT: v_writelane_b32 v31, s17, 61 +; GCN-NEXT: v_writelane_b32 v31, s18, 62 +; GCN-NEXT: v_writelane_b32 v31, s19, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: 
buffer_store_dword v0, off, s[52:55], 0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s36, v1, 32 -; GCN-NEXT: v_readlane_b32 s37, v1, 33 -; GCN-NEXT: v_readlane_b32 s38, v1, 34 -; GCN-NEXT: v_readlane_b32 s39, v1, 35 -; GCN-NEXT: v_readlane_b32 s40, v1, 36 -; GCN-NEXT: v_readlane_b32 s41, v1, 37 -; GCN-NEXT: v_readlane_b32 s42, v1, 38 -; GCN-NEXT: v_readlane_b32 s43, v1, 39 -; GCN-NEXT: v_readlane_b32 s44, v1, 40 -; GCN-NEXT: v_readlane_b32 s45, v1, 41 -; GCN-NEXT: v_readlane_b32 s46, v1, 42 -; GCN-NEXT: v_readlane_b32 s47, v1, 43 -; GCN-NEXT: v_readlane_b32 s48, v1, 44 -; GCN-NEXT: v_readlane_b32 s49, v1, 45 -; GCN-NEXT: v_readlane_b32 s50, v1, 46 -; GCN-NEXT: v_readlane_b32 s51, v1, 47 -; GCN-NEXT: v_readlane_b32 s0, v1, 16 -; GCN-NEXT: v_readlane_b32 s1, v1, 17 -; GCN-NEXT: v_readlane_b32 s2, v1, 18 -; GCN-NEXT: v_readlane_b32 s3, v1, 19 -; GCN-NEXT: v_readlane_b32 s4, v1, 20 -; GCN-NEXT: v_readlane_b32 s5, v1, 21 -; GCN-NEXT: v_readlane_b32 s6, v1, 22 -; GCN-NEXT: v_readlane_b32 s7, v1, 23 -; GCN-NEXT: v_readlane_b32 s8, v1, 24 -; GCN-NEXT: v_readlane_b32 s9, v1, 25 -; GCN-NEXT: v_readlane_b32 s10, v1, 26 -; 
GCN-NEXT: v_readlane_b32 s11, v1, 27 -; GCN-NEXT: v_readlane_b32 s12, v1, 28 -; GCN-NEXT: v_readlane_b32 s13, v1, 29 -; GCN-NEXT: v_readlane_b32 s14, v1, 30 -; GCN-NEXT: v_readlane_b32 s15, v1, 31 -; GCN-NEXT: v_readlane_b32 s16, v1, 0 -; GCN-NEXT: v_readlane_b32 s17, v1, 1 -; GCN-NEXT: v_readlane_b32 s18, v1, 2 -; GCN-NEXT: v_readlane_b32 s19, v1, 3 -; GCN-NEXT: v_readlane_b32 s20, v1, 4 -; GCN-NEXT: v_readlane_b32 s21, v1, 5 -; GCN-NEXT: v_readlane_b32 s22, v1, 6 -; GCN-NEXT: v_readlane_b32 s23, v1, 7 -; GCN-NEXT: v_readlane_b32 s24, v1, 8 -; GCN-NEXT: v_readlane_b32 s25, v1, 9 -; GCN-NEXT: v_readlane_b32 s26, v1, 10 -; GCN-NEXT: v_readlane_b32 s27, v1, 11 -; GCN-NEXT: v_readlane_b32 s28, v1, 12 -; GCN-NEXT: v_readlane_b32 s29, v1, 13 -; GCN-NEXT: v_readlane_b32 s30, v1, 14 -; GCN-NEXT: v_readlane_b32 s31, v1, 15 +; GCN-NEXT: v_readlane_b32 s36, v31, 32 +; GCN-NEXT: v_readlane_b32 s37, v31, 33 +; GCN-NEXT: v_readlane_b32 s38, v31, 34 +; GCN-NEXT: v_readlane_b32 s39, v31, 35 +; GCN-NEXT: v_readlane_b32 s40, v31, 36 +; GCN-NEXT: v_readlane_b32 s41, v31, 37 +; GCN-NEXT: v_readlane_b32 s42, v31, 38 +; GCN-NEXT: v_readlane_b32 s43, v31, 39 +; GCN-NEXT: v_readlane_b32 s44, v31, 40 +; GCN-NEXT: v_readlane_b32 s45, v31, 41 +; GCN-NEXT: v_readlane_b32 s46, v31, 42 +; GCN-NEXT: v_readlane_b32 s47, v31, 43 +; GCN-NEXT: v_readlane_b32 s48, v31, 44 +; GCN-NEXT: v_readlane_b32 s49, v31, 45 +; GCN-NEXT: v_readlane_b32 s50, v31, 46 +; GCN-NEXT: v_readlane_b32 s51, v31, 47 +; GCN-NEXT: v_readlane_b32 s0, v31, 16 +; GCN-NEXT: v_readlane_b32 s1, v31, 17 +; GCN-NEXT: v_readlane_b32 s2, v31, 18 +; GCN-NEXT: v_readlane_b32 s3, v31, 19 +; GCN-NEXT: v_readlane_b32 s4, v31, 20 +; GCN-NEXT: v_readlane_b32 s5, v31, 21 +; GCN-NEXT: v_readlane_b32 s6, v31, 22 +; GCN-NEXT: v_readlane_b32 s7, v31, 23 +; GCN-NEXT: v_readlane_b32 s8, v31, 24 +; GCN-NEXT: v_readlane_b32 s9, v31, 25 +; GCN-NEXT: v_readlane_b32 s10, v31, 26 +; GCN-NEXT: v_readlane_b32 s11, v31, 27 +; GCN-NEXT: v_readlane_b32 s12, 
v31, 28 +; GCN-NEXT: v_readlane_b32 s13, v31, 29 +; GCN-NEXT: v_readlane_b32 s14, v31, 30 +; GCN-NEXT: v_readlane_b32 s15, v31, 31 +; GCN-NEXT: v_readlane_b32 s16, v31, 0 +; GCN-NEXT: v_readlane_b32 s17, v31, 1 +; GCN-NEXT: v_readlane_b32 s18, v31, 2 +; GCN-NEXT: v_readlane_b32 s19, v31, 3 +; GCN-NEXT: v_readlane_b32 s20, v31, 4 +; GCN-NEXT: v_readlane_b32 s21, v31, 5 +; GCN-NEXT: v_readlane_b32 s22, v31, 6 +; GCN-NEXT: v_readlane_b32 s23, v31, 7 +; GCN-NEXT: v_readlane_b32 s24, v31, 8 +; GCN-NEXT: v_readlane_b32 s25, v31, 9 +; GCN-NEXT: v_readlane_b32 s26, v31, 10 +; GCN-NEXT: v_readlane_b32 s27, v31, 11 +; GCN-NEXT: v_readlane_b32 s28, v31, 12 +; GCN-NEXT: v_readlane_b32 s29, v31, 13 +; GCN-NEXT: v_readlane_b32 s30, v31, 14 +; GCN-NEXT: v_readlane_b32 s31, v31, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[16:31] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v1, 48 -; GCN-NEXT: v_readlane_b32 s5, v1, 49 -; GCN-NEXT: v_readlane_b32 s6, v1, 50 -; GCN-NEXT: v_readlane_b32 s7, v1, 51 -; GCN-NEXT: v_readlane_b32 s8, v1, 52 -; GCN-NEXT: v_readlane_b32 s9, v1, 53 -; GCN-NEXT: v_readlane_b32 s10, v1, 54 -; GCN-NEXT: v_readlane_b32 s11, v1, 55 -; GCN-NEXT: v_readlane_b32 s12, v1, 56 -; GCN-NEXT: v_readlane_b32 s13, v1, 57 -; GCN-NEXT: v_readlane_b32 s14, v1, 58 -; GCN-NEXT: v_readlane_b32 s15, v1, 59 -; GCN-NEXT: v_readlane_b32 s16, v1, 60 -; GCN-NEXT: v_readlane_b32 s17, v1, 61 -; GCN-NEXT: v_readlane_b32 s18, v1, 62 -; GCN-NEXT: v_readlane_b32 s19, v1, 63 +; GCN-NEXT: v_readlane_b32 s4, v31, 48 +; GCN-NEXT: v_readlane_b32 s5, v31, 49 +; GCN-NEXT: v_readlane_b32 s6, v31, 50 +; GCN-NEXT: v_readlane_b32 s7, v31, 51 +; GCN-NEXT: v_readlane_b32 s8, v31, 52 +; GCN-NEXT: v_readlane_b32 s9, v31, 53 +; GCN-NEXT: v_readlane_b32 s10, v31, 54 +; GCN-NEXT: v_readlane_b32 s11, v31, 55 +; GCN-NEXT: v_readlane_b32 s12, v31, 56 +; GCN-NEXT: v_readlane_b32 s13, v31, 57 +; GCN-NEXT: v_readlane_b32 s14, v31, 
58 +; GCN-NEXT: v_readlane_b32 s15, v31, 59 +; GCN-NEXT: v_readlane_b32 s16, v31, 60 +; GCN-NEXT: v_readlane_b32 s17, v31, 61 +; GCN-NEXT: v_readlane_b32 s18, v31, 62 +; GCN-NEXT: v_readlane_b32 s19, v31, 63 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -969,152 +920,144 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 -; GCN-NEXT: v_writelane_b32 v0, s12, 8 -; GCN-NEXT: v_writelane_b32 v0, s13, 9 -; GCN-NEXT: v_writelane_b32 v0, s14, 10 -; GCN-NEXT: v_writelane_b32 v0, s15, 11 -; GCN-NEXT: v_writelane_b32 v0, s16, 12 -; GCN-NEXT: v_writelane_b32 v0, s17, 13 -; GCN-NEXT: v_writelane_b32 v0, s18, 14 -; GCN-NEXT: v_writelane_b32 v0, s19, 15 +; GCN-NEXT: v_writelane_b32 v31, s4, 0 +; GCN-NEXT: v_writelane_b32 v31, s5, 1 +; GCN-NEXT: v_writelane_b32 v31, s6, 2 +; GCN-NEXT: v_writelane_b32 v31, s7, 3 +; GCN-NEXT: v_writelane_b32 v31, s8, 4 +; GCN-NEXT: v_writelane_b32 v31, s9, 5 +; GCN-NEXT: v_writelane_b32 v31, s10, 6 +; GCN-NEXT: v_writelane_b32 v31, s11, 7 +; GCN-NEXT: v_writelane_b32 v31, s12, 8 +; GCN-NEXT: v_writelane_b32 v31, s13, 9 +; GCN-NEXT: v_writelane_b32 v31, s14, 
10 +; GCN-NEXT: v_writelane_b32 v31, s15, 11 +; GCN-NEXT: v_writelane_b32 v31, s16, 12 +; GCN-NEXT: v_writelane_b32 v31, s17, 13 +; GCN-NEXT: v_writelane_b32 v31, s18, 14 +; GCN-NEXT: v_writelane_b32 v31, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 16 -; GCN-NEXT: v_writelane_b32 v0, s5, 17 -; GCN-NEXT: v_writelane_b32 v0, s6, 18 -; GCN-NEXT: v_writelane_b32 v0, s7, 19 -; GCN-NEXT: v_writelane_b32 v0, s8, 20 -; GCN-NEXT: v_writelane_b32 v0, s9, 21 -; GCN-NEXT: v_writelane_b32 v0, s10, 22 -; GCN-NEXT: v_writelane_b32 v0, s11, 23 -; GCN-NEXT: v_writelane_b32 v0, s12, 24 -; GCN-NEXT: v_writelane_b32 v0, s13, 25 -; GCN-NEXT: v_writelane_b32 v0, s14, 26 -; GCN-NEXT: v_writelane_b32 v0, s15, 27 -; GCN-NEXT: v_writelane_b32 v0, s16, 28 -; GCN-NEXT: v_writelane_b32 v0, s17, 29 -; GCN-NEXT: v_writelane_b32 v0, s18, 30 -; GCN-NEXT: v_writelane_b32 v0, s19, 31 +; GCN-NEXT: v_writelane_b32 v31, s4, 16 +; GCN-NEXT: v_writelane_b32 v31, s5, 17 +; GCN-NEXT: v_writelane_b32 v31, s6, 18 +; GCN-NEXT: v_writelane_b32 v31, s7, 19 +; GCN-NEXT: v_writelane_b32 v31, s8, 20 +; GCN-NEXT: v_writelane_b32 v31, s9, 21 +; GCN-NEXT: v_writelane_b32 v31, s10, 22 +; GCN-NEXT: v_writelane_b32 v31, s11, 23 +; GCN-NEXT: v_writelane_b32 v31, s12, 24 +; GCN-NEXT: v_writelane_b32 v31, s13, 25 +; GCN-NEXT: v_writelane_b32 v31, s14, 26 +; GCN-NEXT: v_writelane_b32 v31, s15, 27 +; GCN-NEXT: v_writelane_b32 v31, s16, 28 +; GCN-NEXT: v_writelane_b32 v31, s17, 29 +; GCN-NEXT: v_writelane_b32 v31, s18, 30 +; GCN-NEXT: v_writelane_b32 v31, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 32 -; GCN-NEXT: v_writelane_b32 v0, s5, 33 -; GCN-NEXT: v_writelane_b32 v0, s6, 34 -; GCN-NEXT: v_writelane_b32 v0, s7, 35 -; GCN-NEXT: v_writelane_b32 v0, s8, 36 -; GCN-NEXT: v_writelane_b32 v0, s9, 37 -; GCN-NEXT: v_writelane_b32 v0, s10, 38 -; GCN-NEXT: v_writelane_b32 v0, s11, 39 -; 
GCN-NEXT: v_writelane_b32 v0, s12, 40 -; GCN-NEXT: v_writelane_b32 v0, s13, 41 -; GCN-NEXT: v_writelane_b32 v0, s14, 42 -; GCN-NEXT: v_writelane_b32 v0, s15, 43 -; GCN-NEXT: v_writelane_b32 v0, s16, 44 -; GCN-NEXT: v_writelane_b32 v0, s17, 45 -; GCN-NEXT: v_writelane_b32 v0, s18, 46 -; GCN-NEXT: v_writelane_b32 v0, s19, 47 +; GCN-NEXT: v_writelane_b32 v31, s4, 32 +; GCN-NEXT: v_writelane_b32 v31, s5, 33 +; GCN-NEXT: v_writelane_b32 v31, s6, 34 +; GCN-NEXT: v_writelane_b32 v31, s7, 35 +; GCN-NEXT: v_writelane_b32 v31, s8, 36 +; GCN-NEXT: v_writelane_b32 v31, s9, 37 +; GCN-NEXT: v_writelane_b32 v31, s10, 38 +; GCN-NEXT: v_writelane_b32 v31, s11, 39 +; GCN-NEXT: v_writelane_b32 v31, s12, 40 +; GCN-NEXT: v_writelane_b32 v31, s13, 41 +; GCN-NEXT: v_writelane_b32 v31, s14, 42 +; GCN-NEXT: v_writelane_b32 v31, s15, 43 +; GCN-NEXT: v_writelane_b32 v31, s16, 44 +; GCN-NEXT: v_writelane_b32 v31, s17, 45 +; GCN-NEXT: v_writelane_b32 v31, s18, 46 +; GCN-NEXT: v_writelane_b32 v31, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 48 -; GCN-NEXT: v_writelane_b32 v0, s5, 49 -; GCN-NEXT: v_writelane_b32 v0, s6, 50 -; GCN-NEXT: v_writelane_b32 v0, s7, 51 -; GCN-NEXT: v_writelane_b32 v0, s8, 52 -; GCN-NEXT: v_writelane_b32 v0, s9, 53 -; GCN-NEXT: v_writelane_b32 v0, s10, 54 -; GCN-NEXT: v_writelane_b32 v0, s11, 55 -; GCN-NEXT: v_writelane_b32 v0, s12, 56 -; GCN-NEXT: v_writelane_b32 v0, s13, 57 -; GCN-NEXT: v_writelane_b32 v0, s14, 58 -; GCN-NEXT: v_writelane_b32 v0, s15, 59 -; GCN-NEXT: v_writelane_b32 v0, s16, 60 -; GCN-NEXT: v_writelane_b32 v0, s17, 61 -; GCN-NEXT: v_writelane_b32 v0, s18, 62 -; GCN-NEXT: v_writelane_b32 v0, s19, 63 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_writelane_b32 v31, s4, 48 +; GCN-NEXT: v_writelane_b32 v31, s5, 49 +; GCN-NEXT: v_writelane_b32 v31, s6, 
50 +; GCN-NEXT: v_writelane_b32 v31, s7, 51 +; GCN-NEXT: v_writelane_b32 v31, s8, 52 +; GCN-NEXT: v_writelane_b32 v31, s9, 53 +; GCN-NEXT: v_writelane_b32 v31, s10, 54 +; GCN-NEXT: v_writelane_b32 v31, s11, 55 +; GCN-NEXT: v_writelane_b32 v31, s12, 56 +; GCN-NEXT: v_writelane_b32 v31, s13, 57 +; GCN-NEXT: v_writelane_b32 v31, s14, 58 +; GCN-NEXT: v_writelane_b32 v31, s15, 59 +; GCN-NEXT: v_writelane_b32 v31, s16, 60 +; GCN-NEXT: v_writelane_b32 v31, s17, 61 +; GCN-NEXT: v_writelane_b32 v31, s18, 62 +; GCN-NEXT: v_writelane_b32 v31, s19, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s36, v2, 32 -; GCN-NEXT: v_readlane_b32 s37, v2, 33 -; GCN-NEXT: v_readlane_b32 s38, v2, 34 -; GCN-NEXT: v_readlane_b32 s39, v2, 35 -; GCN-NEXT: v_readlane_b32 s40, v2, 36 -; GCN-NEXT: v_readlane_b32 s41, v2, 37 -; GCN-NEXT: v_readlane_b32 s42, v2, 38 -; GCN-NEXT: v_readlane_b32 s43, v2, 39 -; GCN-NEXT: 
v_readlane_b32 s44, v2, 40 -; GCN-NEXT: v_readlane_b32 s45, v2, 41 -; GCN-NEXT: v_readlane_b32 s46, v2, 42 -; GCN-NEXT: v_readlane_b32 s47, v2, 43 -; GCN-NEXT: v_readlane_b32 s48, v2, 44 -; GCN-NEXT: v_readlane_b32 s49, v2, 45 -; GCN-NEXT: v_readlane_b32 s50, v2, 46 -; GCN-NEXT: v_readlane_b32 s51, v2, 47 -; GCN-NEXT: v_readlane_b32 s0, v2, 16 -; GCN-NEXT: v_readlane_b32 s1, v2, 17 -; GCN-NEXT: v_readlane_b32 s2, v2, 18 -; GCN-NEXT: v_readlane_b32 s3, v2, 19 -; GCN-NEXT: v_readlane_b32 s4, v2, 20 -; GCN-NEXT: v_readlane_b32 s5, v2, 21 -; GCN-NEXT: v_readlane_b32 s6, v2, 22 -; GCN-NEXT: v_readlane_b32 s7, v2, 23 -; GCN-NEXT: v_readlane_b32 s8, v2, 24 -; GCN-NEXT: v_readlane_b32 s9, v2, 25 -; GCN-NEXT: v_readlane_b32 s10, v2, 26 -; GCN-NEXT: v_readlane_b32 s11, v2, 27 -; GCN-NEXT: v_readlane_b32 s12, v2, 28 -; GCN-NEXT: v_readlane_b32 s13, v2, 29 -; GCN-NEXT: v_readlane_b32 s14, v2, 30 -; GCN-NEXT: v_readlane_b32 s15, v2, 31 -; GCN-NEXT: v_readlane_b32 s16, v2, 0 -; GCN-NEXT: v_readlane_b32 s17, v2, 1 -; GCN-NEXT: v_readlane_b32 s18, v2, 2 -; GCN-NEXT: v_readlane_b32 s19, v2, 3 -; GCN-NEXT: v_readlane_b32 s20, v2, 4 -; GCN-NEXT: v_readlane_b32 s21, v2, 5 -; GCN-NEXT: v_readlane_b32 s22, v2, 6 -; GCN-NEXT: v_readlane_b32 s23, v2, 7 -; GCN-NEXT: v_readlane_b32 s24, v2, 8 -; GCN-NEXT: v_readlane_b32 s25, v2, 9 -; GCN-NEXT: v_readlane_b32 s26, v2, 10 -; GCN-NEXT: v_readlane_b32 s27, v2, 11 -; GCN-NEXT: v_readlane_b32 s28, v2, 12 -; GCN-NEXT: v_readlane_b32 s29, v2, 13 -; GCN-NEXT: v_readlane_b32 s30, v2, 14 -; GCN-NEXT: v_readlane_b32 s31, v2, 15 +; GCN-NEXT: v_readlane_b32 s36, v31, 32 +; GCN-NEXT: v_readlane_b32 s37, v31, 33 +; GCN-NEXT: v_readlane_b32 s38, v31, 34 +; GCN-NEXT: v_readlane_b32 s39, v31, 35 +; GCN-NEXT: v_readlane_b32 s40, v31, 36 +; GCN-NEXT: v_readlane_b32 s41, v31, 37 +; GCN-NEXT: v_readlane_b32 s42, v31, 38 +; GCN-NEXT: v_readlane_b32 s43, v31, 39 +; GCN-NEXT: v_readlane_b32 s44, v31, 40 +; GCN-NEXT: v_readlane_b32 s45, v31, 41 +; GCN-NEXT: 
v_readlane_b32 s46, v31, 42 +; GCN-NEXT: v_readlane_b32 s47, v31, 43 +; GCN-NEXT: v_readlane_b32 s48, v31, 44 +; GCN-NEXT: v_readlane_b32 s49, v31, 45 +; GCN-NEXT: v_readlane_b32 s50, v31, 46 +; GCN-NEXT: v_readlane_b32 s51, v31, 47 +; GCN-NEXT: v_readlane_b32 s0, v31, 16 +; GCN-NEXT: v_readlane_b32 s1, v31, 17 +; GCN-NEXT: v_readlane_b32 s2, v31, 18 +; GCN-NEXT: v_readlane_b32 s3, v31, 19 +; GCN-NEXT: v_readlane_b32 s4, v31, 20 +; GCN-NEXT: v_readlane_b32 s5, v31, 21 +; GCN-NEXT: v_readlane_b32 s6, v31, 22 +; GCN-NEXT: v_readlane_b32 s7, v31, 23 +; GCN-NEXT: v_readlane_b32 s8, v31, 24 +; GCN-NEXT: v_readlane_b32 s9, v31, 25 +; GCN-NEXT: v_readlane_b32 s10, v31, 26 +; GCN-NEXT: v_readlane_b32 s11, v31, 27 +; GCN-NEXT: v_readlane_b32 s12, v31, 28 +; GCN-NEXT: v_readlane_b32 s13, v31, 29 +; GCN-NEXT: v_readlane_b32 s14, v31, 30 +; GCN-NEXT: v_readlane_b32 s15, v31, 31 +; GCN-NEXT: v_readlane_b32 s16, v31, 0 +; GCN-NEXT: v_readlane_b32 s17, v31, 1 +; GCN-NEXT: v_readlane_b32 s18, v31, 2 +; GCN-NEXT: v_readlane_b32 s19, v31, 3 +; GCN-NEXT: v_readlane_b32 s20, v31, 4 +; GCN-NEXT: v_readlane_b32 s21, v31, 5 +; GCN-NEXT: v_readlane_b32 s22, v31, 6 +; GCN-NEXT: v_readlane_b32 s23, v31, 7 +; GCN-NEXT: v_readlane_b32 s24, v31, 8 +; GCN-NEXT: v_readlane_b32 s25, v31, 9 +; GCN-NEXT: v_readlane_b32 s26, v31, 10 +; GCN-NEXT: v_readlane_b32 s27, v31, 11 +; GCN-NEXT: v_readlane_b32 s28, v31, 12 +; GCN-NEXT: v_readlane_b32 s29, v31, 13 +; GCN-NEXT: v_readlane_b32 s30, v31, 14 +; GCN-NEXT: v_readlane_b32 s31, v31, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v0 ; GCN-NEXT: ;;#ASMEND @@ -1124,24 +1067,32 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v2, 48 -; GCN-NEXT: v_readlane_b32 s5, v2, 49 -; GCN-NEXT: v_readlane_b32 s6, v2, 50 -; GCN-NEXT: v_readlane_b32 s7, v2, 51 -; GCN-NEXT: v_readlane_b32 s8, v2, 52 -; GCN-NEXT: v_readlane_b32 s9, v2, 53 -; 
GCN-NEXT: v_readlane_b32 s10, v2, 54 -; GCN-NEXT: v_readlane_b32 s11, v2, 55 -; GCN-NEXT: v_readlane_b32 s12, v2, 56 -; GCN-NEXT: v_readlane_b32 s13, v2, 57 -; GCN-NEXT: v_readlane_b32 s14, v2, 58 -; GCN-NEXT: v_readlane_b32 s15, v2, 59 -; GCN-NEXT: v_readlane_b32 s16, v2, 60 -; GCN-NEXT: v_readlane_b32 s17, v2, 61 -; GCN-NEXT: v_readlane_b32 s18, v2, 62 -; GCN-NEXT: v_readlane_b32 s19, v2, 63 +; GCN-NEXT: v_readlane_b32 s4, v31, 48 +; GCN-NEXT: v_readlane_b32 s5, v31, 49 +; GCN-NEXT: v_readlane_b32 s6, v31, 50 +; GCN-NEXT: v_readlane_b32 s7, v31, 51 +; GCN-NEXT: v_readlane_b32 s8, v31, 52 +; GCN-NEXT: v_readlane_b32 s9, v31, 53 +; GCN-NEXT: v_readlane_b32 s10, v31, 54 +; GCN-NEXT: v_readlane_b32 s11, v31, 55 +; GCN-NEXT: v_readlane_b32 s12, v31, 56 +; GCN-NEXT: v_readlane_b32 s13, v31, 57 +; GCN-NEXT: v_readlane_b32 s14, v31, 58 +; GCN-NEXT: v_readlane_b32 s15, v31, 59 +; GCN-NEXT: v_readlane_b32 s16, v31, 60 +; GCN-NEXT: v_readlane_b32 s17, v31, 61 +; GCN-NEXT: v_readlane_b32 s18, v31, 62 +; GCN-NEXT: v_readlane_b32 s19, v31, 63 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 2cbb505cd55d4..959bc7f33426b 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -1,377 +1,22 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn--amdhsa 
-mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s +; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s -; This was a negative test to catch an extreme case when all options are exhausted -; while trying to spill SGPRs to memory. After we enabled SGPR spills into virtual VGPRs -; the edge case won't arise and the test would always compile. +; This ends up needing to spill SGPRs to memory, and also does not +; have any free SGPRs available to save the exec mask when doing so. +; The register scavenger also needs to use the emergency stack slot, +; which tries to place the scavenged register restore instruction as +; far down the block as possible, near the terminator. This places a +; restore instruction between the condition and the conditional +; branch, which gets expanded into a sequence involving s_not_b64 on +; the exec mask, clobbering the SCC value before the branch. We probably +; have to stop relying on being able to flip and restore the exec +; mask, and always require a free SGPR for saving exec.
-define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { -; CHECK-LABEL: kernel0: -; CHECK: ; %bb.0: -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 -; CHECK-NEXT: v_writelane_b32 v0, s2, 0 -; CHECK-NEXT: v_writelane_b32 v0, s3, 1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s4, 2 -; CHECK-NEXT: v_writelane_b32 v0, s5, 3 -; CHECK-NEXT: v_writelane_b32 v0, s6, 4 -; CHECK-NEXT: v_writelane_b32 v0, s7, 5 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s4, 6 -; CHECK-NEXT: v_writelane_b32 v0, s5, 7 -; CHECK-NEXT: v_writelane_b32 v0, s6, 8 -; CHECK-NEXT: v_writelane_b32 v0, s7, 9 -; CHECK-NEXT: v_writelane_b32 v0, s8, 10 -; CHECK-NEXT: v_writelane_b32 v0, s9, 11 -; CHECK-NEXT: v_writelane_b32 v0, s10, 12 -; CHECK-NEXT: v_writelane_b32 v0, s11, 13 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:19] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s4, 14 -; CHECK-NEXT: v_writelane_b32 v0, s5, 15 -; CHECK-NEXT: v_writelane_b32 v0, s6, 16 -; CHECK-NEXT: v_writelane_b32 v0, s7, 17 -; CHECK-NEXT: v_writelane_b32 v0, s8, 18 -; CHECK-NEXT: v_writelane_b32 v0, s9, 19 -; CHECK-NEXT: v_writelane_b32 v0, s10, 20 -; CHECK-NEXT: v_writelane_b32 v0, s11, 21 -; CHECK-NEXT: v_writelane_b32 v0, s12, 22 -; CHECK-NEXT: v_writelane_b32 v0, s13, 23 -; CHECK-NEXT: v_writelane_b32 v0, s14, 24 -; CHECK-NEXT: v_writelane_b32 v0, s15, 25 -; CHECK-NEXT: v_writelane_b32 v0, s16, 26 -; CHECK-NEXT: v_writelane_b32 v0, s17, 27 -; CHECK-NEXT: v_writelane_b32 v0, s18, 28 
-; CHECK-NEXT: v_writelane_b32 v0, s19, 29 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s2, 30 -; CHECK-NEXT: v_writelane_b32 v0, s3, 31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s4, 32 -; CHECK-NEXT: v_writelane_b32 v0, s5, 33 -; CHECK-NEXT: v_writelane_b32 v0, s6, 34 -; CHECK-NEXT: v_writelane_b32 v0, s7, 35 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s4, 36 -; CHECK-NEXT: v_writelane_b32 v0, s5, 37 -; CHECK-NEXT: v_writelane_b32 v0, s6, 38 -; CHECK-NEXT: v_writelane_b32 v0, s7, 39 -; CHECK-NEXT: v_writelane_b32 v0, s8, 40 -; CHECK-NEXT: v_writelane_b32 v0, s9, 41 -; CHECK-NEXT: v_writelane_b32 v0, s10, 42 -; CHECK-NEXT: v_writelane_b32 v0, s11, 43 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[16:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:53] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[48:51] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[36:43] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 44 -; CHECK-NEXT: v_writelane_b32 v0, s1, 45 -; CHECK-NEXT: v_writelane_b32 v0, s2, 46 -; CHECK-NEXT: v_writelane_b32 v0, s3, 47 -; CHECK-NEXT: v_writelane_b32 v0, s4, 48 -; CHECK-NEXT: v_writelane_b32 v0, s5, 49 -; CHECK-NEXT: v_writelane_b32 v0, s6, 50 -; CHECK-NEXT: v_writelane_b32 v0, s7, 51 -; CHECK-NEXT: v_writelane_b32 v0, s8, 52 -; CHECK-NEXT: v_writelane_b32 v0, s9, 53 -; CHECK-NEXT: v_writelane_b32 v0, s10, 54 -; CHECK-NEXT: v_writelane_b32 v0, s11, 55 -; CHECK-NEXT: v_writelane_b32 v0, s12, 56 -; CHECK-NEXT: v_writelane_b32 v0, s13, 57 -; CHECK-NEXT: v_writelane_b32 v0, s14, 58 -; 
CHECK-NEXT: v_writelane_b32 v0, s15, 59 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[34:35] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:47] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr1 -; CHECK-NEXT: v_writelane_b32 v0, s0, 60 -; CHECK-NEXT: v_writelane_b32 v1, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s1, 61 -; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; CHECK-NEXT: v_writelane_b32 v0, s2, 62 -; CHECK-NEXT: v_writelane_b32 v1, s6, 2 -; CHECK-NEXT: v_writelane_b32 v0, s3, 63 -; CHECK-NEXT: v_writelane_b32 v1, s7, 3 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v1, s0, 4 -; CHECK-NEXT: v_writelane_b32 v1, s1, 5 -; CHECK-NEXT: v_writelane_b32 v1, s2, 6 -; CHECK-NEXT: v_writelane_b32 v1, s3, 7 -; CHECK-NEXT: v_writelane_b32 v1, s4, 8 -; CHECK-NEXT: v_writelane_b32 v1, s5, 9 -; CHECK-NEXT: v_writelane_b32 v1, s6, 10 -; CHECK-NEXT: v_writelane_b32 v1, s7, 11 -; CHECK-NEXT: v_writelane_b32 v1, s8, 12 -; CHECK-NEXT: v_writelane_b32 v1, s9, 13 -; CHECK-NEXT: v_writelane_b32 v1, s10, 14 -; CHECK-NEXT: v_writelane_b32 v1, s11, 15 -; CHECK-NEXT: v_writelane_b32 v1, s12, 16 -; CHECK-NEXT: v_writelane_b32 v1, s13, 17 -; CHECK-NEXT: v_writelane_b32 v1, s14, 18 -; CHECK-NEXT: v_writelane_b32 v1, s15, 19 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[54:55] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v1, s0, 20 -; CHECK-NEXT: v_writelane_b32 v1, s1, 21 -; CHECK-NEXT: v_writelane_b32 v1, s2, 22 -; CHECK-NEXT: v_writelane_b32 v1, s3, 23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v1, s0, 24 -; CHECK-NEXT: v_writelane_b32 v1, s1, 25 -; CHECK-NEXT: v_writelane_b32 v1, s2, 26 -; CHECK-NEXT: v_writelane_b32 v1, s3, 27 -; CHECK-NEXT: 
v_writelane_b32 v1, s4, 28 -; CHECK-NEXT: v_writelane_b32 v1, s5, 29 -; CHECK-NEXT: v_writelane_b32 v1, s6, 30 -; CHECK-NEXT: v_writelane_b32 v1, s7, 31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v1, s0, 32 -; CHECK-NEXT: v_writelane_b32 v1, s1, 33 -; CHECK-NEXT: v_writelane_b32 v1, s2, 34 -; CHECK-NEXT: v_writelane_b32 v1, s3, 35 -; CHECK-NEXT: v_writelane_b32 v1, s4, 36 -; CHECK-NEXT: v_writelane_b32 v1, s5, 37 -; CHECK-NEXT: v_writelane_b32 v1, s6, 38 -; CHECK-NEXT: v_writelane_b32 v1, s7, 39 -; CHECK-NEXT: v_writelane_b32 v1, s8, 40 -; CHECK-NEXT: v_writelane_b32 v1, s9, 41 -; CHECK-NEXT: v_writelane_b32 v1, s10, 42 -; CHECK-NEXT: v_writelane_b32 v1, s11, 43 -; CHECK-NEXT: v_writelane_b32 v1, s12, 44 -; CHECK-NEXT: v_writelane_b32 v1, s13, 45 -; CHECK-NEXT: v_writelane_b32 v1, s14, 46 -; CHECK-NEXT: v_writelane_b32 v1, s15, 47 -; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 -; CHECK-NEXT: ; %bb.1: ; %ret -; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB0_2: ; %bb0 -; CHECK-NEXT: v_readlane_b32 s0, v0, 0 -; CHECK-NEXT: v_readlane_b32 s1, v0, 1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 2 -; CHECK-NEXT: v_readlane_b32 s1, v0, 3 -; CHECK-NEXT: v_readlane_b32 s2, v0, 4 -; CHECK-NEXT: v_readlane_b32 s3, v0, 5 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 6 -; CHECK-NEXT: v_readlane_b32 s1, v0, 7 -; CHECK-NEXT: v_readlane_b32 s2, v0, 8 -; CHECK-NEXT: v_readlane_b32 s3, v0, 9 -; CHECK-NEXT: v_readlane_b32 s4, v0, 10 -; CHECK-NEXT: v_readlane_b32 s5, v0, 11 -; CHECK-NEXT: v_readlane_b32 s6, v0, 12 -; CHECK-NEXT: v_readlane_b32 s7, v0, 13 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 14 -; CHECK-NEXT: v_readlane_b32 s1, v0, 15 -; CHECK-NEXT: v_readlane_b32 s2, v0, 16 -; CHECK-NEXT: v_readlane_b32 s3, 
v0, 17 -; CHECK-NEXT: v_readlane_b32 s4, v0, 18 -; CHECK-NEXT: v_readlane_b32 s5, v0, 19 -; CHECK-NEXT: v_readlane_b32 s6, v0, 20 -; CHECK-NEXT: v_readlane_b32 s7, v0, 21 -; CHECK-NEXT: v_readlane_b32 s8, v0, 22 -; CHECK-NEXT: v_readlane_b32 s9, v0, 23 -; CHECK-NEXT: v_readlane_b32 s10, v0, 24 -; CHECK-NEXT: v_readlane_b32 s11, v0, 25 -; CHECK-NEXT: v_readlane_b32 s12, v0, 26 -; CHECK-NEXT: v_readlane_b32 s13, v0, 27 -; CHECK-NEXT: v_readlane_b32 s14, v0, 28 -; CHECK-NEXT: v_readlane_b32 s15, v0, 29 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 30 -; CHECK-NEXT: v_readlane_b32 s1, v0, 31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 32 -; CHECK-NEXT: v_readlane_b32 s1, v0, 33 -; CHECK-NEXT: v_readlane_b32 s2, v0, 34 -; CHECK-NEXT: v_readlane_b32 s3, v0, 35 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 36 -; CHECK-NEXT: v_readlane_b32 s1, v0, 37 -; CHECK-NEXT: v_readlane_b32 s2, v0, 38 -; CHECK-NEXT: v_readlane_b32 s3, v0, 39 -; CHECK-NEXT: v_readlane_b32 s4, v0, 40 -; CHECK-NEXT: v_readlane_b32 s5, v0, 41 -; CHECK-NEXT: v_readlane_b32 s6, v0, 42 -; CHECK-NEXT: v_readlane_b32 s7, v0, 43 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 44 -; CHECK-NEXT: v_readlane_b32 s1, v0, 45 -; CHECK-NEXT: v_readlane_b32 s2, v0, 46 -; CHECK-NEXT: v_readlane_b32 s3, v0, 47 -; CHECK-NEXT: v_readlane_b32 s4, v0, 48 -; CHECK-NEXT: v_readlane_b32 s5, v0, 49 -; CHECK-NEXT: v_readlane_b32 s6, v0, 50 -; CHECK-NEXT: v_readlane_b32 s7, v0, 51 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[16:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:53] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: 
;;#ASMSTART -; CHECK-NEXT: ; use s[36:43] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s8, v0, 52 -; CHECK-NEXT: v_readlane_b32 s9, v0, 53 -; CHECK-NEXT: v_readlane_b32 s10, v0, 54 -; CHECK-NEXT: v_readlane_b32 s11, v0, 55 -; CHECK-NEXT: v_readlane_b32 s12, v0, 56 -; CHECK-NEXT: v_readlane_b32 s13, v0, 57 -; CHECK-NEXT: v_readlane_b32 s14, v0, 58 -; CHECK-NEXT: v_readlane_b32 s15, v0, 59 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 60 -; CHECK-NEXT: v_readlane_b32 s1, v0, 61 -; CHECK-NEXT: v_readlane_b32 s2, v0, 62 -; CHECK-NEXT: v_readlane_b32 s3, v0, 63 -; CHECK-NEXT: v_readlane_b32 s4, v1, 0 -; CHECK-NEXT: v_readlane_b32 s5, v1, 1 -; CHECK-NEXT: v_readlane_b32 s6, v1, 2 -; CHECK-NEXT: v_readlane_b32 s7, v1, 3 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[34:35] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:47] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v1, 4 -; CHECK-NEXT: v_readlane_b32 s1, v1, 5 -; CHECK-NEXT: v_readlane_b32 s2, v1, 6 -; CHECK-NEXT: v_readlane_b32 s3, v1, 7 -; CHECK-NEXT: v_readlane_b32 s4, v1, 8 -; CHECK-NEXT: v_readlane_b32 s5, v1, 9 -; CHECK-NEXT: v_readlane_b32 s6, v1, 10 -; CHECK-NEXT: v_readlane_b32 s7, v1, 11 -; CHECK-NEXT: v_readlane_b32 s8, v1, 12 -; CHECK-NEXT: v_readlane_b32 s9, v1, 13 -; CHECK-NEXT: v_readlane_b32 s10, v1, 14 -; CHECK-NEXT: v_readlane_b32 s11, v1, 15 -; CHECK-NEXT: v_readlane_b32 s12, v1, 16 -; CHECK-NEXT: v_readlane_b32 s13, v1, 17 -; CHECK-NEXT: v_readlane_b32 s14, v1, 18 -; CHECK-NEXT: v_readlane_b32 s15, v1, 19 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v1, 20 -; CHECK-NEXT: v_readlane_b32 s1, v1, 21 -; CHECK-NEXT: v_readlane_b32 s2, v1, 22 -; CHECK-NEXT: v_readlane_b32 s3, v1, 23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use 
s[54:55] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v1, 24 -; CHECK-NEXT: v_readlane_b32 s1, v1, 25 -; CHECK-NEXT: v_readlane_b32 s2, v1, 26 -; CHECK-NEXT: v_readlane_b32 s3, v1, 27 -; CHECK-NEXT: v_readlane_b32 s4, v1, 28 -; CHECK-NEXT: v_readlane_b32 s5, v1, 29 -; CHECK-NEXT: v_readlane_b32 s6, v1, 30 -; CHECK-NEXT: v_readlane_b32 s7, v1, 31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v1, 32 -; CHECK-NEXT: v_readlane_b32 s1, v1, 33 -; CHECK-NEXT: v_readlane_b32 s2, v1, 34 -; CHECK-NEXT: v_readlane_b32 s3, v1, 35 -; CHECK-NEXT: v_readlane_b32 s4, v1, 36 -; CHECK-NEXT: v_readlane_b32 s5, v1, 37 -; CHECK-NEXT: v_readlane_b32 s6, v1, 38 -; CHECK-NEXT: v_readlane_b32 s7, v1, 39 -; CHECK-NEXT: v_readlane_b32 s8, v1, 40 -; CHECK-NEXT: v_readlane_b32 s9, v1, 41 -; CHECK-NEXT: v_readlane_b32 s10, v1, 42 -; CHECK-NEXT: v_readlane_b32 s11, v1, 43 -; CHECK-NEXT: v_readlane_b32 s12, v1, 44 -; CHECK-NEXT: v_readlane_b32 s13, v1, 45 -; CHECK-NEXT: v_readlane_b32 s14, v1, 46 -; CHECK-NEXT: v_readlane_b32 s15, v1, 47 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_endpgm +; CHECK: *** Bad machine code: Using an undefined physical register *** +; CHECK-NEXT: - function: kernel0 +; CHECK-NEXT: - basic block: %bb.0 +; CHECK-NEXT: - instruction: S_CBRANCH_SCC1 %bb.2, implicit killed $scc +; CHECK-NEXT: - operand 1: implicit killed $scc +define amdgpu_kernel void @kernel0(i32 addrspace(1)* %out, i32 %in) #1 { call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 call void asm sideeffect "", "~{v[16:19]}"() #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir index 3ce4c6c67718b..687adc69bd148 100644 --- 
a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir @@ -1,6 +1,4 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, replace the dead frame index in the DBG_VALUE instruction with reg 0. # Otherwise, the test would crash during PEI while trying to replace the dead frame index. 
@@ -41,21 +39,13 @@ machineFunctionInfo: workGroupIDX: { reg: '$sgpr8' } privateSegmentWaveByteOffset: { reg: '$sgpr9' } body: | - ; SGPR_SPILL-LABEL: name: test - ; SGPR_SPILL: bb.0: - ; SGPR_SPILL: [[VGPR:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; SGPR_SPILL: [[VGPR]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[VGPR]] - ; SGPR_SPILL: DBG_VALUE $noreg, 0 - ; SGPR_SPILL: bb.1: - ; SGPR_SPILL: $sgpr10 = V_READLANE_B32 [[VGPR]], 0 - ; SGPR_SPILL: S_ENDPGM 0 - ; PEI-LABEL: name: test - ; PEI: bb.0: - ; PEI: renamable $[[VGPR:vgpr[0-9]+]] = IMPLICIT_DEF - ; PEI: renamable $[[VGPR]] = V_WRITELANE_B32 killed $sgpr10, 0, killed $[[VGPR]] - ; PEI: bb.1: - ; PEI: $sgpr10 = V_READLANE_B32 killed $[[VGPR]], 0 - ; PEI: S_ENDPGM 0 + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, $vgpr0 + ; CHECK: DBG_VALUE $noreg, 0 + ; CHECK: bb.1: + ; CHECK: $sgpr10 = V_READLANE_B32 $vgpr0, 0 + ; CHECK: S_ENDPGM 0 bb.0: renamable $sgpr10 = IMPLICIT_DEF SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir index a6cb7d4af7641..4694810379fe0 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, we replace the dead frame index in the DBG_VALUE instruction with reg 0. 
# Skip looking for frame indices in the debug value instruction for incoming arguments passed via stack. The test would crash otherwise. @@ -45,7 +45,7 @@ machineFunctionInfo: body: | ; CHECK-LABEL: name: test ; CHECK: bb.0: - ; CHECK: DBG_VALUE + ; CHECK: DBG_VALUE $noreg, 0 bb.0: renamable $sgpr10 = IMPLICIT_DEF SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index 804779d5a63f8..16aadade906e9 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -1,10 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; This test was originally written when SGPRs are spilled directly to physical VGPRs and -; stressed a case when there wasn't enough VGPRs to accommodate all spills. -; When we started spilling them into virtual VGPR lanes, we always succeed in doing so. -; The regalloc pass later takes care of allocating VGPRs to these virtual registers. +; The first 64 SGPR spills can go to a VGPR, but there isn't a second +; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element. 
define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 { ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: @@ -25,179 +23,179 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s8, 0 -; GCN-NEXT: v_writelane_b32 v0, s9, 1 -; GCN-NEXT: v_writelane_b32 v0, s10, 2 -; GCN-NEXT: v_writelane_b32 v0, s11, 3 -; GCN-NEXT: v_writelane_b32 v0, s12, 4 -; GCN-NEXT: v_writelane_b32 v0, s13, 5 -; GCN-NEXT: v_writelane_b32 v0, s14, 6 -; GCN-NEXT: v_writelane_b32 v0, s15, 7 -; GCN-NEXT: v_writelane_b32 v0, s16, 8 -; GCN-NEXT: v_writelane_b32 v0, s17, 9 -; GCN-NEXT: v_writelane_b32 v0, s18, 10 -; GCN-NEXT: v_writelane_b32 v0, s19, 11 -; GCN-NEXT: v_writelane_b32 v0, s20, 12 -; GCN-NEXT: v_writelane_b32 v0, s21, 13 -; GCN-NEXT: v_writelane_b32 v0, s22, 14 -; GCN-NEXT: v_writelane_b32 v0, s23, 15 +; GCN-NEXT: v_writelane_b32 v23, s8, 0 +; GCN-NEXT: v_writelane_b32 v23, s9, 1 +; GCN-NEXT: v_writelane_b32 v23, s10, 2 +; GCN-NEXT: v_writelane_b32 v23, s11, 3 +; GCN-NEXT: v_writelane_b32 v23, s12, 4 +; GCN-NEXT: v_writelane_b32 v23, s13, 5 +; GCN-NEXT: v_writelane_b32 v23, s14, 6 +; GCN-NEXT: v_writelane_b32 v23, s15, 7 +; GCN-NEXT: v_writelane_b32 v23, s16, 8 +; GCN-NEXT: v_writelane_b32 v23, s17, 9 +; GCN-NEXT: v_writelane_b32 v23, s18, 10 +; GCN-NEXT: v_writelane_b32 v23, s19, 11 +; GCN-NEXT: v_writelane_b32 v23, s20, 12 +; GCN-NEXT: v_writelane_b32 v23, s21, 13 +; GCN-NEXT: v_writelane_b32 v23, s22, 14 +; GCN-NEXT: v_writelane_b32 v23, s23, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s8, 16 -; GCN-NEXT: v_writelane_b32 v0, s9, 17 -; GCN-NEXT: v_writelane_b32 v0, s10, 18 -; GCN-NEXT: v_writelane_b32 v0, s11, 19 -; GCN-NEXT: v_writelane_b32 v0, s12, 20 -; GCN-NEXT: v_writelane_b32 v0, s13, 21 -; GCN-NEXT: 
v_writelane_b32 v0, s14, 22 -; GCN-NEXT: v_writelane_b32 v0, s15, 23 -; GCN-NEXT: v_writelane_b32 v0, s16, 24 -; GCN-NEXT: v_writelane_b32 v0, s17, 25 -; GCN-NEXT: v_writelane_b32 v0, s18, 26 -; GCN-NEXT: v_writelane_b32 v0, s19, 27 -; GCN-NEXT: v_writelane_b32 v0, s20, 28 -; GCN-NEXT: v_writelane_b32 v0, s21, 29 -; GCN-NEXT: v_writelane_b32 v0, s22, 30 -; GCN-NEXT: v_writelane_b32 v0, s23, 31 +; GCN-NEXT: v_writelane_b32 v23, s8, 16 +; GCN-NEXT: v_writelane_b32 v23, s9, 17 +; GCN-NEXT: v_writelane_b32 v23, s10, 18 +; GCN-NEXT: v_writelane_b32 v23, s11, 19 +; GCN-NEXT: v_writelane_b32 v23, s12, 20 +; GCN-NEXT: v_writelane_b32 v23, s13, 21 +; GCN-NEXT: v_writelane_b32 v23, s14, 22 +; GCN-NEXT: v_writelane_b32 v23, s15, 23 +; GCN-NEXT: v_writelane_b32 v23, s16, 24 +; GCN-NEXT: v_writelane_b32 v23, s17, 25 +; GCN-NEXT: v_writelane_b32 v23, s18, 26 +; GCN-NEXT: v_writelane_b32 v23, s19, 27 +; GCN-NEXT: v_writelane_b32 v23, s20, 28 +; GCN-NEXT: v_writelane_b32 v23, s21, 29 +; GCN-NEXT: v_writelane_b32 v23, s22, 30 +; GCN-NEXT: v_writelane_b32 v23, s23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s8, 32 -; GCN-NEXT: v_writelane_b32 v0, s9, 33 -; GCN-NEXT: v_writelane_b32 v0, s10, 34 -; GCN-NEXT: v_writelane_b32 v0, s11, 35 -; GCN-NEXT: v_writelane_b32 v0, s12, 36 -; GCN-NEXT: v_writelane_b32 v0, s13, 37 -; GCN-NEXT: v_writelane_b32 v0, s14, 38 -; GCN-NEXT: v_writelane_b32 v0, s15, 39 -; GCN-NEXT: v_writelane_b32 v0, s16, 40 -; GCN-NEXT: v_writelane_b32 v0, s17, 41 -; GCN-NEXT: v_writelane_b32 v0, s18, 42 -; GCN-NEXT: v_writelane_b32 v0, s19, 43 -; GCN-NEXT: v_writelane_b32 v0, s20, 44 -; GCN-NEXT: v_writelane_b32 v0, s21, 45 -; GCN-NEXT: v_writelane_b32 v0, s22, 46 -; GCN-NEXT: v_writelane_b32 v0, s23, 47 +; GCN-NEXT: v_writelane_b32 v23, s8, 32 +; GCN-NEXT: v_writelane_b32 v23, s9, 33 +; GCN-NEXT: v_writelane_b32 v23, s10, 34 +; GCN-NEXT: v_writelane_b32 v23, s11, 35 +; GCN-NEXT: v_writelane_b32 v23, s12, 
36 +; GCN-NEXT: v_writelane_b32 v23, s13, 37 +; GCN-NEXT: v_writelane_b32 v23, s14, 38 +; GCN-NEXT: v_writelane_b32 v23, s15, 39 +; GCN-NEXT: v_writelane_b32 v23, s16, 40 +; GCN-NEXT: v_writelane_b32 v23, s17, 41 +; GCN-NEXT: v_writelane_b32 v23, s18, 42 +; GCN-NEXT: v_writelane_b32 v23, s19, 43 +; GCN-NEXT: v_writelane_b32 v23, s20, 44 +; GCN-NEXT: v_writelane_b32 v23, s21, 45 +; GCN-NEXT: v_writelane_b32 v23, s22, 46 +; GCN-NEXT: v_writelane_b32 v23, s23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s8, 48 -; GCN-NEXT: v_writelane_b32 v0, s9, 49 -; GCN-NEXT: v_writelane_b32 v0, s10, 50 -; GCN-NEXT: v_writelane_b32 v0, s11, 51 -; GCN-NEXT: v_writelane_b32 v0, s12, 52 -; GCN-NEXT: v_writelane_b32 v0, s13, 53 -; GCN-NEXT: v_writelane_b32 v0, s14, 54 -; GCN-NEXT: v_writelane_b32 v0, s15, 55 -; GCN-NEXT: v_writelane_b32 v0, s16, 56 -; GCN-NEXT: v_writelane_b32 v0, s17, 57 -; GCN-NEXT: v_writelane_b32 v0, s18, 58 -; GCN-NEXT: v_writelane_b32 v0, s19, 59 -; GCN-NEXT: v_writelane_b32 v0, s20, 60 -; GCN-NEXT: v_writelane_b32 v0, s21, 61 -; GCN-NEXT: v_writelane_b32 v0, s22, 62 -; GCN-NEXT: v_writelane_b32 v0, s23, 63 -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: v_writelane_b32 v23, s8, 48 +; GCN-NEXT: v_writelane_b32 v23, s9, 49 +; GCN-NEXT: v_writelane_b32 v23, s10, 50 +; GCN-NEXT: v_writelane_b32 v23, s11, 51 +; GCN-NEXT: v_writelane_b32 v23, s12, 52 +; GCN-NEXT: v_writelane_b32 v23, s13, 53 +; GCN-NEXT: v_writelane_b32 v23, s14, 54 +; GCN-NEXT: v_writelane_b32 v23, s15, 55 +; GCN-NEXT: v_writelane_b32 v23, s16, 56 +; GCN-NEXT: v_writelane_b32 v23, s17, 57 +; GCN-NEXT: v_writelane_b32 v23, s18, 58 +; GCN-NEXT: v_writelane_b32 v23, s19, 59 +; GCN-NEXT: v_writelane_b32 v23, s20, 60 +; GCN-NEXT: v_writelane_b32 v23, s21, 61 +; GCN-NEXT: v_writelane_b32 v23, s22, 62 +; GCN-NEXT: 
v_writelane_b32 v23, s23, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[6:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_mov_b64 s[8:9], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: v_writelane_b32 v0, s6, 0 ; GCN-NEXT: v_writelane_b32 v0, s7, 1 -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, s5 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v1, 0 -; GCN-NEXT: v_readlane_b32 s5, v1, 1 -; GCN-NEXT: v_readlane_b32 s6, v1, 2 -; GCN-NEXT: v_readlane_b32 s7, v1, 3 -; GCN-NEXT: v_readlane_b32 s8, v1, 4 -; GCN-NEXT: v_readlane_b32 s9, v1, 5 -; GCN-NEXT: v_readlane_b32 s10, v1, 6 -; GCN-NEXT: v_readlane_b32 s11, v1, 7 -; GCN-NEXT: v_readlane_b32 s12, v1, 8 -; GCN-NEXT: v_readlane_b32 s13, v1, 9 -; GCN-NEXT: v_readlane_b32 s14, v1, 10 -; GCN-NEXT: v_readlane_b32 s15, v1, 11 -; GCN-NEXT: v_readlane_b32 s16, v1, 12 -; GCN-NEXT: v_readlane_b32 s17, v1, 13 -; GCN-NEXT: v_readlane_b32 s18, v1, 14 -; GCN-NEXT: v_readlane_b32 s19, v1, 15 +; GCN-NEXT: v_readlane_b32 s4, v23, 0 +; GCN-NEXT: v_readlane_b32 s5, v23, 1 +; GCN-NEXT: v_readlane_b32 s6, v23, 2 +; GCN-NEXT: v_readlane_b32 s7, v23, 3 +; GCN-NEXT: v_readlane_b32 s8, v23, 4 +; GCN-NEXT: v_readlane_b32 s9, v23, 5 +; GCN-NEXT: 
v_readlane_b32 s10, v23, 6 +; GCN-NEXT: v_readlane_b32 s11, v23, 7 +; GCN-NEXT: v_readlane_b32 s12, v23, 8 +; GCN-NEXT: v_readlane_b32 s13, v23, 9 +; GCN-NEXT: v_readlane_b32 s14, v23, 10 +; GCN-NEXT: v_readlane_b32 s15, v23, 11 +; GCN-NEXT: v_readlane_b32 s16, v23, 12 +; GCN-NEXT: v_readlane_b32 s17, v23, 13 +; GCN-NEXT: v_readlane_b32 s18, v23, 14 +; GCN-NEXT: v_readlane_b32 s19, v23, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v1, 16 -; GCN-NEXT: v_readlane_b32 s5, v1, 17 -; GCN-NEXT: v_readlane_b32 s6, v1, 18 -; GCN-NEXT: v_readlane_b32 s7, v1, 19 -; GCN-NEXT: v_readlane_b32 s8, v1, 20 -; GCN-NEXT: v_readlane_b32 s9, v1, 21 -; GCN-NEXT: v_readlane_b32 s10, v1, 22 -; GCN-NEXT: v_readlane_b32 s11, v1, 23 -; GCN-NEXT: v_readlane_b32 s12, v1, 24 -; GCN-NEXT: v_readlane_b32 s13, v1, 25 -; GCN-NEXT: v_readlane_b32 s14, v1, 26 -; GCN-NEXT: v_readlane_b32 s15, v1, 27 -; GCN-NEXT: v_readlane_b32 s16, v1, 28 -; GCN-NEXT: v_readlane_b32 s17, v1, 29 -; GCN-NEXT: v_readlane_b32 s18, v1, 30 -; GCN-NEXT: v_readlane_b32 s19, v1, 31 +; GCN-NEXT: v_readlane_b32 s4, v23, 16 +; GCN-NEXT: v_readlane_b32 s5, v23, 17 +; GCN-NEXT: v_readlane_b32 s6, v23, 18 +; GCN-NEXT: v_readlane_b32 s7, v23, 19 +; GCN-NEXT: v_readlane_b32 s8, v23, 20 +; GCN-NEXT: v_readlane_b32 s9, v23, 21 +; GCN-NEXT: v_readlane_b32 s10, v23, 22 +; GCN-NEXT: v_readlane_b32 s11, v23, 23 +; GCN-NEXT: v_readlane_b32 s12, v23, 24 +; GCN-NEXT: v_readlane_b32 s13, v23, 25 +; GCN-NEXT: v_readlane_b32 s14, v23, 26 +; GCN-NEXT: v_readlane_b32 s15, v23, 27 +; GCN-NEXT: v_readlane_b32 s16, v23, 28 +; GCN-NEXT: v_readlane_b32 s17, v23, 29 +; GCN-NEXT: v_readlane_b32 s18, v23, 30 +; GCN-NEXT: v_readlane_b32 s19, v23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v1, 32 -; GCN-NEXT: v_readlane_b32 s5, v1, 33 -; GCN-NEXT: v_readlane_b32 s6, v1, 34 -; GCN-NEXT: v_readlane_b32 s7, v1, 35 -; GCN-NEXT: 
v_readlane_b32 s8, v1, 36 -; GCN-NEXT: v_readlane_b32 s9, v1, 37 -; GCN-NEXT: v_readlane_b32 s10, v1, 38 -; GCN-NEXT: v_readlane_b32 s11, v1, 39 -; GCN-NEXT: v_readlane_b32 s12, v1, 40 -; GCN-NEXT: v_readlane_b32 s13, v1, 41 -; GCN-NEXT: v_readlane_b32 s14, v1, 42 -; GCN-NEXT: v_readlane_b32 s15, v1, 43 -; GCN-NEXT: v_readlane_b32 s16, v1, 44 -; GCN-NEXT: v_readlane_b32 s17, v1, 45 -; GCN-NEXT: v_readlane_b32 s18, v1, 46 -; GCN-NEXT: v_readlane_b32 s19, v1, 47 +; GCN-NEXT: v_readlane_b32 s4, v23, 32 +; GCN-NEXT: v_readlane_b32 s5, v23, 33 +; GCN-NEXT: v_readlane_b32 s6, v23, 34 +; GCN-NEXT: v_readlane_b32 s7, v23, 35 +; GCN-NEXT: v_readlane_b32 s8, v23, 36 +; GCN-NEXT: v_readlane_b32 s9, v23, 37 +; GCN-NEXT: v_readlane_b32 s10, v23, 38 +; GCN-NEXT: v_readlane_b32 s11, v23, 39 +; GCN-NEXT: v_readlane_b32 s12, v23, 40 +; GCN-NEXT: v_readlane_b32 s13, v23, 41 +; GCN-NEXT: v_readlane_b32 s14, v23, 42 +; GCN-NEXT: v_readlane_b32 s15, v23, 43 +; GCN-NEXT: v_readlane_b32 s16, v23, 44 +; GCN-NEXT: v_readlane_b32 s17, v23, 45 +; GCN-NEXT: v_readlane_b32 s18, v23, 46 +; GCN-NEXT: v_readlane_b32 s19, v23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s8, v1, 48 -; GCN-NEXT: v_readlane_b32 s9, v1, 49 -; GCN-NEXT: v_readlane_b32 s10, v1, 50 -; GCN-NEXT: v_readlane_b32 s11, v1, 51 -; GCN-NEXT: v_readlane_b32 s12, v1, 52 -; GCN-NEXT: v_readlane_b32 s13, v1, 53 -; GCN-NEXT: v_readlane_b32 s14, v1, 54 -; GCN-NEXT: v_readlane_b32 s15, v1, 55 -; GCN-NEXT: v_readlane_b32 s16, v1, 56 -; GCN-NEXT: v_readlane_b32 s17, v1, 57 -; GCN-NEXT: v_readlane_b32 s18, v1, 58 -; GCN-NEXT: v_readlane_b32 s19, v1, 59 -; GCN-NEXT: v_readlane_b32 s20, v1, 60 -; GCN-NEXT: v_readlane_b32 s21, v1, 61 -; GCN-NEXT: v_readlane_b32 s22, v1, 62 -; GCN-NEXT: v_readlane_b32 s23, v1, 63 +; GCN-NEXT: v_readlane_b32 s8, v23, 48 +; GCN-NEXT: v_readlane_b32 s9, v23, 49 +; GCN-NEXT: v_readlane_b32 s10, v23, 50 +; GCN-NEXT: v_readlane_b32 s11, v23, 51 +; GCN-NEXT: 
v_readlane_b32 s12, v23, 52 +; GCN-NEXT: v_readlane_b32 s13, v23, 53 +; GCN-NEXT: v_readlane_b32 s14, v23, 54 +; GCN-NEXT: v_readlane_b32 s15, v23, 55 +; GCN-NEXT: v_readlane_b32 s16, v23, 56 +; GCN-NEXT: v_readlane_b32 s17, v23, 57 +; GCN-NEXT: v_readlane_b32 s18, v23, 58 +; GCN-NEXT: v_readlane_b32 s19, v23, 59 +; GCN-NEXT: v_readlane_b32 s20, v23, 60 +; GCN-NEXT: v_readlane_b32 s21, v23, 61 +; GCN-NEXT: v_readlane_b32 s22, v23, 62 +; GCN-NEXT: v_readlane_b32 s23, v23, 63 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[8:23] ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir index 26a5eedc3eca3..9596d3b7f6359 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir @@ -20,11 +20,10 @@ body: | liveins: $sgpr4 ; CHECK-LABEL: name: sgpr_spill_s64_undef_high32 - ; CHECK: liveins: $sgpr4 + ; CHECK: liveins: $sgpr4, $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable 
$sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... @@ -46,11 +45,10 @@ body: | liveins: $sgpr5 ; CHECK-LABEL: name: sgpr_spill_s64_undef_low32 - ; CHECK: liveins: $sgpr5 + ; CHECK: liveins: $sgpr5, $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... 
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll index c0364b4d0e90d..b20f540cf2472 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll @@ -12,17 +12,16 @@ define amdgpu_kernel void @kernel() { ; GCN-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s38, -1 -; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: s_mov_b32 s39, 0xe00000 -; GCN-NEXT: v_writelane_b32 v3, s4, 0 +; GCN-NEXT: v_writelane_b32 v40, s4, 0 ; GCN-NEXT: s_add_u32 s36, s36, s11 -; GCN-NEXT: v_writelane_b32 v3, s5, 1 +; GCN-NEXT: v_writelane_b32 v40, s5, 1 ; GCN-NEXT: s_addc_u32 s37, s37, 0 ; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: v_readlane_b32 s0, v3, 0 +; GCN-NEXT: v_readlane_b32 s0, v40, 0 ; GCN-NEXT: s_mov_b32 s13, s9 ; GCN-NEXT: s_mov_b32 s12, s8 -; GCN-NEXT: v_readlane_b32 s1, v3, 1 +; GCN-NEXT: v_readlane_b32 s1, v40, 1 ; GCN-NEXT: s_add_u32 s8, s0, 36 ; GCN-NEXT: s_addc_u32 s9, s1, 0 ; GCN-NEXT: s_getpc_b64 s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir index 542c756a67570..aadd9e79ff617 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -start-before=si-lower-sgpr-spills -stop-after=prologepilog -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s # Check that we allocate 2 emergency stack slots if we're spilling # SGPRs to memory and 
potentially have an offset larger than fits in @@ -29,7 +29,7 @@ body: | ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec ; CHECK-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr2 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr10, 0, undef $vgpr2 + ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr10, 0, undef $vgpr2 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 03a538e975bef..fc28fd7575040 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,10 +16,10 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s14, s33 +; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill @@ -133,20 +133,13 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s33 
offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s30, 0 -; GCN-NEXT: v_writelane_b32 v0, s31, 1 -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[12:13] +; GCN-NEXT: v_writelane_b32 v255, s30, 0 +; GCN-NEXT: v_writelane_b32 v255, s31, 1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:444 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, child_function@gotpcrel32@hi+12 @@ -157,8 +150,8 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v0, 1 -; GCN-NEXT: v_readlane_b32 s30, v0, 0 +; GCN-NEXT: v_readlane_b32 s31, v255, 1 +; GCN-NEXT: v_readlane_b32 s30, v255, 0 ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -270,11 +263,11 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload -; 
GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s14 +; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -315,10 +308,10 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s14, s33 +; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill @@ -431,20 +424,13 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s30, 0 -; GCN-NEXT: v_writelane_b32 v0, s31, 1 -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[12:13] +; GCN-NEXT: v_writelane_b32 v254, s30, 0 +; GCN-NEXT: v_writelane_b32 v254, s31, 1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:440 ; GCN-NEXT: 
s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, child_function@gotpcrel32@hi+12 @@ -455,8 +441,8 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v0, 1 -; GCN-NEXT: v_readlane_b32 s30, v0, 0 +; GCN-NEXT: v_readlane_b32 s31, v254, 1 +; GCN-NEXT: v_readlane_b32 s30, v254, 0 ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -567,11 +553,11 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s14 +; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -612,8 +598,8 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-LABEL: spill_sgpr_with_sgpr_uses: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill @@ -733,18 +719,10 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s4 ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_writelane_b32 v254, s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-NEXT: v_readlane_b32 s4, v254, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s4 ; GCN-NEXT: ;;#ASMEND @@ -859,8 +837,8 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1183,8 +1161,7 @@ define void 
@spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill @@ -1298,54 +1275,45 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: v_writelane_b32 v4, s34, 0 ; GCN-NEXT: v_writelane_b32 v4, s35, 1 ; GCN-NEXT: v_writelane_b32 v4, s36, 2 ; GCN-NEXT: v_writelane_b32 v4, s37, 3 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 -; GCN-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec -; GCN-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GCN-NEXT: v_mov_b32_e32 v2, v4 +; GCN-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec +; GCN-NEXT: ; 
kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec -; GCN-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-NEXT: v_mov_b32_e32 v3, v5 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-NEXT: flat_load_dwordx4 v[3:6], v[3:4] +; GCN-NEXT: flat_load_dwordx4 v[5:8], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx4 v[1:2], v[3:6] -; GCN-NEXT: v_readlane_b32 s37, v0, 3 -; GCN-NEXT: v_readlane_b32 s36, v0, 2 -; GCN-NEXT: v_readlane_b32 s35, v0, 1 -; GCN-NEXT: v_readlane_b32 s34, v0, 0 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; GCN-NEXT: v_readlane_b32 s37, v4, 3 +; GCN-NEXT: v_readlane_b32 s36, v4, 2 +; GCN-NEXT: v_readlane_b32 s35, v4, 1 +; GCN-NEXT: v_readlane_b32 s34, v4, 0 ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -1459,8 +1427,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1541,11 +1508,8 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s14, s33 +; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; 
GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill @@ -1659,11 +1623,21 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s30, 0 -; GCN-NEXT: v_writelane_b32 v0, s31, 1 -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 s[14:15], exec +; GCN-NEXT: s_mov_b64 exec, 1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:456 +; GCN-NEXT: v_writelane_b32 v1, s30, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:456 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[14:15] +; GCN-NEXT: s_mov_b64 s[12:13], exec +; GCN-NEXT: s_mov_b64 exec, 1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 +; GCN-NEXT: v_writelane_b32 v0, s31, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, child_function_ipra@rel32@lo+4 @@ -1673,12 +1647,24 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: s_mov_b64 s[0:1], s[8:9] ; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-NEXT: s_mov_b64 s[8:9], exec +; GCN-NEXT: s_mov_b64 exec, 1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:456 +; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v1, 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:456 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b64 exec, 1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v0, 1 ; GCN-NEXT: v_readlane_b32 s30, v0, 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -1791,11 +1777,8 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s14 +; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void @child_function_ipra() diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index 5eb0ec734cf2e..f82b9e4637bac 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -6,16 +6,16 @@ ; ALL: 
s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. -; SGPR: v_mov_b32_e32 v0, vcc_lo -; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload -; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]] +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 -; SGPR-NEXT: s_nop 4 +; SGPR-NEXT: buffer_load_dword [[VHI]], off, s[96:99], 0 +; SGPR-NEXT: s_waitcnt vmcnt(0) +; SGPR-NEXT: s_mov_b64 exec, s[4:5] +; SGPR-NEXT: s_nop 1 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; ALL: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index c178fa9174762..1c426e281df31 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -212,15 +212,15 @@ entry: ; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 ; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0 -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 
s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll index 601c36f3146a8..8d49223215ca5 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -2,22 +2,20 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_xor_saveexec_b64 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN: s_or_saveexec_b64 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2 +; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 -; GCN: s_xor_saveexec_b64 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 +; GCN: s_or_saveexec_b64 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index 65292eb96c694..67ed07878b7f2 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll 
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -117,16 +117,16 @@ define void @test_sgpr_offset_function_scavenge_fail_func() #2 { ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 +; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -199,15 +199,16 @@ define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_mov_b32 s10, 0x40100 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 +; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -638,5 +639,5 @@ entry: attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-num-sgpr"="17" 
"amdgpu-num-vgpr"="8" } -attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } -attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } +attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" } +attributes #3 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir index c16342579401a..a4400670ab550 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-lower-sgpr-spills,prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # Make sure the initial first $sgpr1 = COPY $sgpr2 copy is not deleted # by the copy propagation after lowering the spill. 
@@ -26,12 +26,11 @@ body: | ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr8 = COPY renamable $sgpr1 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr1 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -64,11 +63,10 @@ body: | ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr0 = 
V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -95,12 +93,12 @@ body: | ; GCN-LABEL: name: spill_vgpr128_use_subreg ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec, 
implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) - ; GCN-NEXT: renamable $vgpr8 = COPY $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr1 ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr8 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -125,11 +123,11 @@ body: | ; GCN-LABEL: name: spill_vgpr128_use_kill ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) ; GCN-NEXT: S_ENDPGM 0 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 2aba719a03a50..db8b2c4371c38 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10085,25 +10085,16 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v5, -1, v0 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s38, 0 +; GFX6-NEXT: s_mov_b32 s39, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 8, v5 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:240 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 -; GFX6-NEXT: s_mov_b32 s2, 0x83800 -; GFX6-NEXT: s_mov_b64 s[34:35], exec -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 
4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x83400 +; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10111,7 +10102,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10120,7 +10111,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:208 ; GFX6-NEXT: s_mov_b32 s2, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10129,7 +10120,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, 
off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:192 ; GFX6-NEXT: s_mov_b32 s2, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10138,7 +10129,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:176 ; GFX6-NEXT: s_mov_b32 s2, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10147,7 +10138,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:160 ; GFX6-NEXT: s_mov_b32 s2, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10156,7 +10147,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) 
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:144 ; GFX6-NEXT: s_mov_b32 s2, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10165,7 +10156,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:128 ; GFX6-NEXT: s_mov_b32 s2, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10174,7 +10165,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:112 ; GFX6-NEXT: s_mov_b32 s2, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10183,7 +10174,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:96 ; GFX6-NEXT: 
s_mov_b32 s2, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10192,8 +10183,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:64 -; GFX6-NEXT: s_mov_b32 s2, 0x80800 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:80 +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10201,32 +10192,26 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:64 +; GFX6-NEXT: s_mov_b32 s2, 0x80400 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:16 
+; GFX6-NEXT: s_mov_b32 s2, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:32 -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] -; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v9, s0, 0 -; GFX6-NEXT: v_writelane_b32 v9, s1, 1 -; GFX6-NEXT: v_writelane_b32 v9, s2, 2 -; GFX6-NEXT: v_writelane_b32 v9, s3, 3 -; GFX6-NEXT: s_mov_b32 s8, 0x80400 -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s8 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[34:35] -; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:48 +; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[36:39], 0 addr64 offset:32 +; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[36:39], 0 addr64 offset:48 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, 1 @@ -10234,7 +10219,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[4:11] ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10246,12 +10230,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s9, 5 ; GFX6-NEXT: 
v_writelane_b32 v4, s10, 6 ; GFX6-NEXT: v_writelane_b32 v4, s11, 7 -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 +; GFX6-NEXT: s_mov_b32 s2, 0x83800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 exec, s[44:45] ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[8:15] @@ -10269,211 +10253,272 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ; def s[2:3] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ; def s[36:37] +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[38:39], exec +; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v9, s8, 0 +; GFX6-NEXT: v_writelane_b32 v9, s9, 1 +; GFX6-NEXT: v_writelane_b32 v9, s10, 2 +; GFX6-NEXT: v_writelane_b32 v9, s11, 3 +; GFX6-NEXT: v_writelane_b32 v9, s12, 4 +; GFX6-NEXT: v_writelane_b32 v9, s13, 5 +; GFX6-NEXT: v_writelane_b32 v9, s14, 6 +; GFX6-NEXT: v_writelane_b32 v9, s15, 7 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2100 +; GFX6-NEXT: buffer_store_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: v_mov_b32_e32 v4, 0x20e0 +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte 
Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readlane_b32 s8, v8, 0 +; GFX6-NEXT: v_readlane_b32 s9, v8, 1 +; GFX6-NEXT: v_readlane_b32 s10, v8, 2 +; GFX6-NEXT: v_readlane_b32 s11, v8, 3 +; GFX6-NEXT: v_readlane_b32 s12, v8, 4 +; GFX6-NEXT: v_readlane_b32 s13, v8, 5 +; GFX6-NEXT: v_readlane_b32 s14, v8, 6 +; GFX6-NEXT: v_readlane_b32 s15, v8, 7 +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v7, s8, 0 -; GFX6-NEXT: v_writelane_b32 v7, s9, 1 -; GFX6-NEXT: v_writelane_b32 v7, s10, 2 -; GFX6-NEXT: v_writelane_b32 v7, s11, 3 -; GFX6-NEXT: v_writelane_b32 v7, s12, 4 -; GFX6-NEXT: v_writelane_b32 v7, s13, 5 -; GFX6-NEXT: v_writelane_b32 v7, s14, 6 -; GFX6-NEXT: v_writelane_b32 v7, s15, 7 -; GFX6-NEXT: s_mov_b32 s36, 0x84400 -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s36 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v7, s16, 0 +; GFX6-NEXT: v_writelane_b32 v7, s17, 1 +; GFX6-NEXT: v_writelane_b32 v7, s18, 2 +; GFX6-NEXT: v_writelane_b32 v7, s19, 3 +; GFX6-NEXT: v_writelane_b32 v7, s20, 4 +; GFX6-NEXT: v_writelane_b32 v7, s21, 5 +; GFX6-NEXT: v_writelane_b32 v7, s22, 6 +; GFX6-NEXT: v_writelane_b32 v7, s23, 7 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2120 +; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[38:39] +; GFX6-NEXT: s_mov_b64 exec, s[44:45] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s36, 0x83c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2100 +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 ; 
GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s8, v4, 0 -; GFX6-NEXT: v_readlane_b32 s9, v4, 1 -; GFX6-NEXT: v_readlane_b32 s10, v4, 2 -; GFX6-NEXT: v_readlane_b32 s11, v4, 3 -; GFX6-NEXT: v_readlane_b32 s12, v4, 4 -; GFX6-NEXT: v_readlane_b32 s13, v4, 5 -; GFX6-NEXT: v_readlane_b32 s14, v4, 6 -; GFX6-NEXT: v_readlane_b32 s15, v4, 7 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s16, v9, 0 +; GFX6-NEXT: v_readlane_b32 s17, v9, 1 +; GFX6-NEXT: v_readlane_b32 s18, v9, 2 +; GFX6-NEXT: v_readlane_b32 s19, v9, 3 +; GFX6-NEXT: v_readlane_b32 s20, v9, 4 +; GFX6-NEXT: v_readlane_b32 s21, v9, 5 +; GFX6-NEXT: v_readlane_b32 s22, v9, 6 +; GFX6-NEXT: v_readlane_b32 s23, v9, 7 +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[38:39], exec +; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v8, s16, 0 -; GFX6-NEXT: v_writelane_b32 v8, s17, 1 -; GFX6-NEXT: v_writelane_b32 v8, s18, 2 -; GFX6-NEXT: v_writelane_b32 v8, s19, 3 -; GFX6-NEXT: v_writelane_b32 v8, s20, 4 -; GFX6-NEXT: v_writelane_b32 v8, s21, 5 -; GFX6-NEXT: v_writelane_b32 v8, s22, 6 -; GFX6-NEXT: v_writelane_b32 v8, s23, 7 -; GFX6-NEXT: s_mov_b32 s36, 0x84c00 -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s36 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v8, s24, 0 +; GFX6-NEXT: v_writelane_b32 v8, s25, 1 +; GFX6-NEXT: v_writelane_b32 v8, s26, 2 +; GFX6-NEXT: v_writelane_b32 v8, s27, 3 +; GFX6-NEXT: v_writelane_b32 v8, s28, 4 +; GFX6-NEXT: v_writelane_b32 v8, s29, 5 +; GFX6-NEXT: v_writelane_b32 v8, s30, 6 +; GFX6-NEXT: v_writelane_b32 v8, 
s31, 7 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2140 +; GFX6-NEXT: buffer_store_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[38:39] +; GFX6-NEXT: s_mov_b64 exec, s[44:45] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s36, 0x84400 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2120 ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s36 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s16, v7, 0 -; GFX6-NEXT: v_readlane_b32 s17, v7, 1 -; GFX6-NEXT: v_readlane_b32 s18, v7, 2 -; GFX6-NEXT: v_readlane_b32 s19, v7, 3 -; GFX6-NEXT: v_readlane_b32 s20, v7, 4 -; GFX6-NEXT: v_readlane_b32 s21, v7, 5 -; GFX6-NEXT: v_readlane_b32 s22, v7, 6 -; GFX6-NEXT: v_readlane_b32 s23, v7, 7 +; GFX6-NEXT: v_readlane_b32 s24, v7, 0 +; GFX6-NEXT: v_readlane_b32 s25, v7, 1 +; GFX6-NEXT: v_readlane_b32 s26, v7, 2 +; GFX6-NEXT: v_readlane_b32 s27, v7, 3 +; GFX6-NEXT: v_readlane_b32 s28, v7, 4 +; GFX6-NEXT: v_readlane_b32 s29, v7, 5 +; GFX6-NEXT: v_readlane_b32 s30, v7, 6 +; GFX6-NEXT: v_readlane_b32 s31, v7, 7 ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[38:39], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s24, 0 -; GFX6-NEXT: v_writelane_b32 v4, s25, 1 -; GFX6-NEXT: v_writelane_b32 v4, s26, 2 -; GFX6-NEXT: v_writelane_b32 v4, s27, 3 -; GFX6-NEXT: v_writelane_b32 v4, s28, 
4 -; GFX6-NEXT: v_writelane_b32 v4, s29, 5 -; GFX6-NEXT: v_writelane_b32 v4, s30, 6 -; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s36, 0x85400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v10, s0, 0 +; GFX6-NEXT: v_writelane_b32 v10, s1, 1 +; GFX6-NEXT: v_writelane_b32 v10, s2, 2 +; GFX6-NEXT: v_writelane_b32 v10, s3, 3 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2160 +; GFX6-NEXT: buffer_store_dword v10, v4, s[40:43], 0 offen ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[38:39] +; GFX6-NEXT: s_mov_b64 exec, s[44:45] ; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s36, 0x84c00 -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v8, s4, 0 +; GFX6-NEXT: v_writelane_b32 v8, s5, 1 +; GFX6-NEXT: v_writelane_b32 v8, s6, 2 +; GFX6-NEXT: v_writelane_b32 v8, s7, 3 +; GFX6-NEXT: s_mov_b32 s0, 0x85c00 +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s36 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s24, v9, 0 -; GFX6-NEXT: v_readlane_b32 s25, v9, 1 -; GFX6-NEXT: v_readlane_b32 s26, v9, 2 -; GFX6-NEXT: v_readlane_b32 s27, v9, 3 -; GFX6-NEXT: v_readlane_b32 s28, v9, 4 -; GFX6-NEXT: v_readlane_b32 s29, v9, 5 -; GFX6-NEXT: v_readlane_b32 s30, v9, 6 -; GFX6-NEXT: v_readlane_b32 s31, v9, 7 +; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 exec, 3 +; GFX6-NEXT: buffer_store_dword v9, off, 
s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v9, s2, 0 +; GFX6-NEXT: v_writelane_b32 v9, s3, 1 +; GFX6-NEXT: s_mov_b32 s4, 0x86600 +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s4 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2140 +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readlane_b32 s0, v7, 0 +; GFX6-NEXT: v_readlane_b32 s1, v7, 1 +; GFX6-NEXT: v_readlane_b32 s2, v7, 2 +; GFX6-NEXT: v_readlane_b32 s3, v7, 3 +; GFX6-NEXT: v_readlane_b32 s4, v7, 4 +; GFX6-NEXT: v_readlane_b32 s5, v7, 5 +; GFX6-NEXT: v_readlane_b32 s6, v7, 6 +; GFX6-NEXT: v_readlane_b32 s7, v7, 7 +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v8, s0, 0 -; GFX6-NEXT: v_writelane_b32 v8, s1, 1 -; GFX6-NEXT: v_writelane_b32 v8, s2, 2 -; GFX6-NEXT: v_writelane_b32 v8, s3, 3 -; GFX6-NEXT: s_mov_b32 s38, 0x85c00 -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s38 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v8, s36, 0 +; GFX6-NEXT: v_writelane_b32 v8, s37, 1 +; GFX6-NEXT: v_writelane_b32 v8, s38, 2 +; GFX6-NEXT: v_writelane_b32 v8, s39, 3 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2180 +; GFX6-NEXT: buffer_store_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 exec, s[44:45] ; GFX6-NEXT: s_mov_b64 s[38:39], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b64 exec, 3 +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s4, 0 -; GFX6-NEXT: v_writelane_b32 v4, s5, 1 -; GFX6-NEXT: v_writelane_b32 v4, s6, 2 -; GFX6-NEXT: v_writelane_b32 v4, s7, 3 -; GFX6-NEXT: s_mov_b32 s0, 0x86000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v10, s36, 0 +; GFX6-NEXT: v_writelane_b32 v10, s37, 1 +; GFX6-NEXT: s_mov_b32 s44, 0x86400 +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s44 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[38:39] ; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v7, s2, 0 -; GFX6-NEXT: v_writelane_b32 v7, s3, 1 -; GFX6-NEXT: s_mov_b32 s0, 0x86400 -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2170 +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readlane_b32 s36, v9, 0 +; GFX6-NEXT: v_readlane_b32 s37, v9, 1 +; GFX6-NEXT: v_readlane_b32 s38, v9, 2 +; GFX6-NEXT: v_readlane_b32 s39, v9, 3 +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: 
s_mov_b64 s[36:37], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x85400 -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 +; GFX6-NEXT: s_not_b64 exec, exec +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2190 +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s38 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: s_not_b64 exec, exec +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2190 +; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: s_not_b64 exec, exec ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v9, 0 -; GFX6-NEXT: v_readlane_b32 s1, v9, 1 -; GFX6-NEXT: v_readlane_b32 s2, v9, 2 -; GFX6-NEXT: v_readlane_b32 s3, v9, 3 -; GFX6-NEXT: v_readlane_b32 s4, v9, 4 -; GFX6-NEXT: v_readlane_b32 s5, v9, 5 -; GFX6-NEXT: v_readlane_b32 s6, v9, 6 -; GFX6-NEXT: v_readlane_b32 s7, v9, 7 -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s44, v7, 0 +; GFX6-NEXT: v_readlane_b32 s45, v7, 1 +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2180 +; GFX6-NEXT: s_not_b64 exec, exec +; GFX6-NEXT: s_mov_b64 vcc, s[34:35] +; GFX6-NEXT: s_not_b64 exec, exec +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2198 ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: s_not_b64 exec, exec +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2198 +; GFX6-NEXT: buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: s_not_b64 exec, exec ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s36, v8, 0 -; GFX6-NEXT: v_readlane_b32 s37, v8, 1 -; 
GFX6-NEXT: v_readlane_b32 s38, v8, 2 -; GFX6-NEXT: v_readlane_b32 s39, v8, 3 +; GFX6-NEXT: v_readlane_b32 s34, v8, 0 +; GFX6-NEXT: v_readlane_b32 s35, v8, 1 ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 vcc, s[34:35] -; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 +; GFX6-NEXT: s_not_b64 exec, exec +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35],s[44:45] +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 s[34:35], vcc +; GFX6-NEXT: s_mov_b64 s[8:9], exec +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: s_mov_b32 s0, 0x86000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s34, v4, 0 -; GFX6-NEXT: v_readlane_b32 s35, v4, 1 +; GFX6-NEXT: v_readlane_b32 s36, v4, 0 +; GFX6-NEXT: v_readlane_b32 s37, v4, 1 +; GFX6-NEXT: v_readlane_b32 s38, v4, 2 +; GFX6-NEXT: v_readlane_b32 s39, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_mov_b64 s[34:35], vcc +; GFX6-NEXT: s_mov_b64 exec, s[8:9] ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x85c00 +; GFX6-NEXT: s_mov_b32 s6, 0x85800 ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s6 ; 4-byte Folded Reload @@ -10485,19 +10530,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: 
buffer_load_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 +; GFX6-NEXT: s_mov_b32 s2, 0x83800 ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84400 +; GFX6-NEXT: s_mov_b32 s2, 0x84000 ; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84c00 +; GFX6-NEXT: s_mov_b32 s2, 0x84800 ; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill @@ -10510,12 +10555,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x84400 +; GFX6-NEXT: s_mov_b32 s2, 0x84000 ; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 
offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 +; GFX6-NEXT: s_mov_b32 s2, 0x83800 ; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10534,28 +10579,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: .LBB1_2: ; %ret ; GFX6-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX6-NEXT: s_mov_b64 s[8:9], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s2, 0x80400 -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s2 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s4, v10, 0 -; GFX6-NEXT: v_readlane_b32 s5, v10, 1 -; GFX6-NEXT: v_readlane_b32 s6, v10, 2 -; GFX6-NEXT: v_readlane_b32 s7, v10, 3 -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: s_mov_b32 s4, 0x83800 +; GFX6-NEXT: s_mov_b32 s4, 0x83400 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[5:6], 8 ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] -; GFX6-NEXT: s_mov_b32 s4, 0x83400 +; GFX6-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX6-NEXT: s_mov_b32 s4, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10563,7 +10594,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; 
GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x83000 +; GFX6-NEXT: s_mov_b32 s4, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10571,7 +10602,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82c00 +; GFX6-NEXT: s_mov_b32 s4, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10579,7 +10610,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82800 +; GFX6-NEXT: s_mov_b32 s4, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10587,7 +10618,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: 
s_mov_b32 s4, 0x82400 +; GFX6-NEXT: s_mov_b32 s4, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10595,7 +10626,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82000 +; GFX6-NEXT: s_mov_b32 s4, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10603,7 +10634,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81c00 +; GFX6-NEXT: s_mov_b32 s4, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10611,7 +10642,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81800 +; GFX6-NEXT: s_mov_b32 s4, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10619,7 +10650,7 @@ define amdgpu_kernel void 
@test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81400 +; GFX6-NEXT: s_mov_b32 s4, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10627,7 +10658,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81000 +; GFX6-NEXT: s_mov_b32 s4, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10635,7 +10666,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80800 +; GFX6-NEXT: s_mov_b32 s4, 0x80400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10643,7 +10674,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, 
s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80c00 +; GFX6-NEXT: s_mov_b32 s4, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:64 ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48 @@ -10745,13 +10776,16 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ; def s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART +; GFX9-FLATSCR-NEXT: ; def s[44:45] +; GFX9-FLATSCR-NEXT: ;;#ASMEND +; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s33 ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART -; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] +; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39],s[44:45] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill @@ -10896,13 +10930,16 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ; def s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; def s38 +; GFX10-FLATSCR-NEXT: ; def s[38:39] +; GFX10-FLATSCR-NEXT: ;;#ASMEND +; GFX10-FLATSCR-NEXT: ;;#ASMSTART +; GFX10-FLATSCR-NEXT: ; def s44 ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX10-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX10-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] +; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35],s[38:39] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; 
GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v59 @@ -11079,14 +11116,15 @@ entry: %sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () %sgpr4 = call <4 x i32> asm sideeffect "; def $0", "=s" () %sgpr5 = call <2 x i32> asm sideeffect "; def $0", "=s" () - %sgpr6 = call i32 asm sideeffect "; def $0", "=s" () + %sgpr6 = call <2 x i32> asm sideeffect "; def $0", "=s" () + %sgpr7 = call i32 asm sideeffect "; def $0", "=s" () %cmp = icmp eq i32 %x, 0 br i1 %cmp, label %bb0, label %ret bb0: ; create SGPR pressure - call void asm sideeffect "; use $0,$1,$2,$3,$4,$5", "s,s,s,s,s,s,s"(<8 x i32> %sgpr0, <8 x i32> %sgpr1, <8 x i32> %sgpr2, <8 x i32> %sgpr3, <4 x i32> %sgpr4, <2 x i32> %sgpr5, i32 %sgpr6) + call void asm sideeffect "; use $0,$1,$2,$3,$4,$5,$6", "s,s,s,s,s,s,s,s"(<8 x i32> %sgpr0, <8 x i32> %sgpr1, <8 x i32> %sgpr2, <8 x i32> %sgpr3, <4 x i32> %sgpr4, <2 x i32> %sgpr5, <2 x i32> %sgpr6, i32 %sgpr7) ; mark most VGPR registers as used to increase register pressure call void asm sideeffect "", "~{v4},~{v8},~{v12},~{v16},~{v20},~{v24},~{v28},~{v32}" () diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir index 32710d1be2234..89c305b82b451 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir @@ -10,10 +10,9 @@ body: | bb.0: liveins: $sgpr50 ; CHECK-LABEL: name: spill_csr_sgpr_argument - ; CHECK: liveins: $sgpr50 + ; CHECK: liveins: $sgpr50, $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr50, 0, [[V_WRITELANE_B32_]] + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr50, 0, $vgpr0 ; CHECK-NEXT: S_NOP 0, implicit $sgpr50 ; CHECK-NEXT: $sgpr50 = S_MOV_B32 0 S_NOP 0, implicit $sgpr50 diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll index 
a636abe38c69c..91d2ec82c81e7 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll @@ -1,50 +1,59 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; The test was originally written to spill an SGPR to scratch without having spare SGPRs available to save exec. -; This scenario no longer exists when we enabled SGPR spill into virtual VGPRs. +; Spill an SGPR to scratch without having spare SGPRs available to save exec define amdgpu_kernel void @test() #1 { ; GFX10-LABEL: test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s14, -1 -; GFX10-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX10-NEXT: s_add_u32 s12, s12, s1 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX10-NEXT: s_add_u32 s8, s8, s1 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s[0:7] ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s[8:12] ; GFX10-NEXT: ;;#ASMEND -; GFX10-NOT: s_not_b64 exec, exec -; GFX10-NEXT: ; implicit-def: $vgpr0 +; GFX10-NEXT: s_not_b64 exec, exec +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX10-NEXT: v_writelane_b32 v0, s8, 0 ; GFX10-NEXT: v_writelane_b32 v0, s9, 1 ; GFX10-NEXT: v_writelane_b32 v0, s10, 2 ; GFX10-NEXT: v_writelane_b32 v0, s11, 3 ; GFX10-NEXT: v_writelane_b32 v0, s12, 4 -; GFX10-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GFX10-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 exec, s[14:15] +; GFX10-NEXT: s_not_b64 exec, exec +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_not_b64 exec, exec +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_not_b64 exec, exec ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s[0:7] ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 exec, s[14:15] +; GFX10-NEXT: s_mov_b64 s[6:7], exec +; GFX10-NEXT: s_mov_b64 exec, 31 +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readlane_b32 s0, v0, 0 ; GFX10-NEXT: v_readlane_b32 s1, v0, 1 ; GFX10-NEXT: v_readlane_b32 s2, v0, 2 ; GFX10-NEXT: v_readlane_b32 s3, v0, 3 ; GFX10-NEXT: v_readlane_b32 s4, v0, 4 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b64 exec, s[6:7] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s[0:4] ; GFX10-NEXT: ;;#ASMEND @@ -58,4 +67,4 @@ define amdgpu_kernel void @test() #1 { } attributes #0 = { nounwind } -attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } +attributes #1 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir deleted file mode 100644 index c5ad6d4dffe90..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir +++ /dev/null @@ -1,320 +0,0 @@ -# NOTE: Assertions have been 
autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=si-lower-sgpr-spills -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s - -# A simple SGPR spill. Implicit def for lane VGPR should be inserted just before the spill instruction. ---- -name: sgpr32_spill -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - bb.0: - liveins: $sgpr30_sgpr31, $sgpr10 - ; GCN-LABEL: name: sgpr32_spill - ; GCN: liveins: $sgpr30_sgpr31, $sgpr10 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_WRITELANE_B32_]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0 - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 - S_NOP 0 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_SETPC_B64 $sgpr30_sgpr31 -... - -# Needed an additional virtual lane register as the lanes of current register are fully occupied while spilling a wide SGPR tuple. -# There must be two implicit def for the two lane VGPRs. 
- ---- -name: sgpr_spill_lane_crossover -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } - - { id: 1, type: spill-slot, size: 128, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - bb.0: - liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-LABEL: name: sgpr_spill_lane_crossover - ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr64, 0, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr65, 1, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr66, 2, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr67, 3, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr68, 4, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed 
$sgpr69, 5, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr70, 6, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr71, 7, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr72, 8, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr73, 9, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr74, 10, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr75, 11, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr76, 12, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr77, 13, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr78, 14, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr79, 15, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr80, 16, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr81, 17, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr82, 18, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr83, 19, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr84, 20, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr85, 21, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr86, 22, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr87, 23, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr88, 24, 
[[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr89, 25, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr90, 26, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr91, 27, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr92, 28, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr93, 29, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr94, 30, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 31, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 32, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr64, 33, [[V_WRITELANE_B32_1]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr65, 34, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr66, 35, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr67, 36, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr68, 37, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr69, 38, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr70, 39, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr71, 40, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr72, 41, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr73, 42, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr74, 43, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr75, 44, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr76, 45, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr77, 46, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr78, 47, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr79, 48, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr80, 49, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr81, 50, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr82, 51, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr83, 52, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr84, 53, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr85, 54, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr86, 55, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr87, 56, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr88, 57, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr89, 58, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr90, 59, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr91, 60, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr92, 61, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr93, 62, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr94, 63, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_2]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 0, [[V_WRITELANE_B32_2]], implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr64 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 33, implicit-def 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: $sgpr65 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 34 - ; GCN-NEXT: $sgpr66 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 35 - ; GCN-NEXT: $sgpr67 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 36 - ; GCN-NEXT: $sgpr68 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 37 - ; GCN-NEXT: $sgpr69 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 38 - ; GCN-NEXT: $sgpr70 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 39 - ; GCN-NEXT: $sgpr71 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 40 - ; GCN-NEXT: $sgpr72 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 41 - ; GCN-NEXT: $sgpr73 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 42 - ; GCN-NEXT: $sgpr74 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 43 - ; GCN-NEXT: $sgpr75 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 44 - ; GCN-NEXT: $sgpr76 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 45 - ; GCN-NEXT: $sgpr77 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 46 - ; GCN-NEXT: $sgpr78 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 47 - ; GCN-NEXT: $sgpr79 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 48 - ; GCN-NEXT: $sgpr80 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 49 - ; GCN-NEXT: $sgpr81 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 50 - ; GCN-NEXT: $sgpr82 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 51 - ; GCN-NEXT: $sgpr83 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 52 - ; GCN-NEXT: $sgpr84 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 53 - ; GCN-NEXT: $sgpr85 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 54 - ; GCN-NEXT: $sgpr86 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 55 - ; GCN-NEXT: $sgpr87 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 56 - ; GCN-NEXT: $sgpr88 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 57 - ; GCN-NEXT: $sgpr89 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 58 - ; GCN-NEXT: $sgpr90 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 59 - ; GCN-NEXT: $sgpr91 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 60 - ; GCN-NEXT: 
$sgpr92 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 61 - ; GCN-NEXT: $sgpr93 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 62 - ; GCN-NEXT: $sgpr94 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 63 - ; GCN-NEXT: $sgpr95 = V_READLANE_B32 [[V_WRITELANE_B32_2]], 0 - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 32 - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 - S_NOP 0 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_NOP 0 - renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_SETPC_B64 $sgpr30_sgpr31 -... - -# The implicit def for the lane VGPR should be inserted at the common dominator block (the entry block here). 
- ---- -name: lane_vgpr_implicit_def_at_common_dominator_block -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - ; GCN-LABEL: name: lane_vgpr_implicit_def_at_common_dominator_block - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = S_MOV_B32 20 - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0 - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 - bb.0: - liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - S_NOP 0 - S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc - S_CBRANCH_SCC1 %bb.2, implicit killed $scc - bb.1: - liveins: $sgpr10, $sgpr30_sgpr31 - $sgpr10 = S_MOV_B32 10 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_BRANCH %bb.3 - bb.2: - liveins: $sgpr10, $sgpr30_sgpr31 - $sgpr10 = S_MOV_B32 20 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_BRANCH %bb.3 - bb.3: - liveins: $sgpr10, $sgpr30_sgpr31 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 -... - -# The common dominator block is visited only at the end. The insertion point was initially identified to the -# terminator instruction in the dominator block which later becomes the point where a spill get inserted in the same block. - ---- -name: dominator_block_follows_the_successors_bbs -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - ; GCN-LABEL: name: dominator_block_follows_the_successors_bbs - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 %0, 0 - ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc - ; GCN-NEXT: S_BRANCH %bb.2 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 %0, 0 - ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: - ; GCN-NEXT: 
successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] - ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc - ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc - ; GCN-NEXT: S_BRANCH %bb.1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.4: - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 - bb.0: - liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - S_NOP 0 - S_BRANCH %bb.3 - bb.1: - liveins: $sgpr10, $sgpr30_sgpr31 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc - S_BRANCH %bb.2 - bb.2: - liveins: $sgpr10, $sgpr30_sgpr31 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc - S_BRANCH %bb.3 - bb.3: - liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - $sgpr10 = S_MOV_B32 10 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc - S_CBRANCH_SCC1 %bb.2, implicit killed $scc - S_BRANCH %bb.1 - bb.4: - liveins: $sgpr10, $sgpr30_sgpr31 - S_NOP 0 - S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 -... 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll deleted file mode 100644 index 71cd094f743cc..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll +++ /dev/null @@ -1,85 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs -o - %s | FileCheck %s - -; Regression test for `processFunctionBeforeFrameFinalized`: -; Check that it correctly updates RegisterScavenger so we -; don't end up with bad machine code due to using undefined -; physical registers. - -define void @test() { -; CHECK-LABEL: test: -; CHECK: ; %bb.0: ; %bb.0 -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: .LBB0_1: ; %bb.1 -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_cbranch_scc1 .LBB0_3 -; CHECK-NEXT: ; %bb.2: ; %bb.2 -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: .LBB0_3: ; %bb.3 -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_readfirstlane_b32 s6, v0 -; CHECK-NEXT: s_mov_b64 s[4:5], -1 -; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: s_cmp_eq_u32 s6, s7 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: s_mov_b64 s[10:11], exec -; CHECK-NEXT: s_mov_b64 exec, -1 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 -; CHECK-NEXT: ; %bb.4: ; %bb.4 -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; 
CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: .LBB0_5: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 -; CHECK-NEXT: v_readlane_b32 s5, v0, 1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CHECK-NEXT: s_mov_b32 s4, 1 -; CHECK-NEXT: ; implicit-def: $sgpr5 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 -; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 -; CHECK-NEXT: ; %bb.6: ; %bb.5 -; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] -bb.0: - br label %bb.1 -bb.1: ; preds = %bb.4, %bb.0 - br i1 poison, label %bb.2, label %bb.3 -bb.2: ; preds = %bb.1 - br label %bb.3 -bb.3: ; preds = %bb.2, %bb.1 - %call = tail call i32 @llvm.amdgcn.readfirstlane(i32 poison) - %cmp = icmp eq i32 %call, 0 - br i1 %cmp, label %bb.5, label %bb.4 -bb.4: ; preds = %bb.3 - br label %bb.1 -bb.5: ; preds = %bb.3 - ret void -} - -declare i32 @llvm.amdgcn.readfirstlane(i32) diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll index e8a46bd72aec2..c1fc297d45643 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -10,17 +10,16 @@ define void @sgpr_spill_writelane() { ; GCN-LABEL: sgpr_spill_writelane: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v0, s35, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_readlane_b32 s35, v0, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{s35}"() diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir index 6fd96da2318b0..df0c836b556e2 100644 --- a/llvm/test/CodeGen/AMDGPU/spill192.mir +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir @@ -32,29 +32,32 @@ body: | ; EXPANDED-LABEL: name: spill_restore_sgpr192 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: 
[[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr9, 5, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; 
EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 bb.0: S_NOP 0, implicit-def %0:sgpr_192 diff --git a/llvm/test/CodeGen/AMDGPU/spill224.mir b/llvm/test/CodeGen/AMDGPU/spill224.mir index 7f4402deadd81..09f208246995c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill224.mir +++ b/llvm/test/CodeGen/AMDGPU/spill224.mir @@ -30,31 +30,34 @@ body: | ; EXPANDED-LABEL: name: spill_restore_sgpr224 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = 
V_WRITELANE_B32 killed $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 6, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; 
EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 bb.0: S_NOP 0, implicit-def %0:sgpr_224 diff --git a/llvm/test/CodeGen/AMDGPU/spill288.mir b/llvm/test/CodeGen/AMDGPU/spill288.mir index 646d2be19e8a0..6e6b7f8bd7e32 100644 --- a/llvm/test/CodeGen/AMDGPU/spill288.mir +++ b/llvm/test/CodeGen/AMDGPU/spill288.mir @@ -30,35 +30,38 @@ body: | ; EXPANDED-LABEL: name: spill_restore_sgpr288 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; 
EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr12, 8, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; EXPANDED-NEXT: 
S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 bb.0: S_NOP 0, implicit-def %0:sgpr_288 diff --git a/llvm/test/CodeGen/AMDGPU/spill320.mir b/llvm/test/CodeGen/AMDGPU/spill320.mir index 7c866cd15f903..a5a1074d1e7dd 100644 --- a/llvm/test/CodeGen/AMDGPU/spill320.mir +++ b/llvm/test/CodeGen/AMDGPU/spill320.mir @@ -30,37 +30,40 @@ body: | ; EXPANDED-LABEL: name: 
spill_restore_sgpr320 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr13, 9, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr13, 9, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; 
EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 bb.0: S_NOP 0, implicit-def %0:sgpr_320 diff --git a/llvm/test/CodeGen/AMDGPU/spill352.mir b/llvm/test/CodeGen/AMDGPU/spill352.mir index ded934f94d320..12a15152e6a10 100644 --- a/llvm/test/CodeGen/AMDGPU/spill352.mir +++ b/llvm/test/CodeGen/AMDGPU/spill352.mir @@ -30,39 +30,42 @@ body: | ; EXPANDED-LABEL: name: spill_restore_sgpr352 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: 
liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr14, 10, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 
+ ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr14, 10, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 - ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 + ; EXPANDED-NEXT: $sgpr14 = 
V_READLANE_B32 $vgpr0, 10 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 bb.0: S_NOP 0, implicit-def %0:sgpr_352 diff --git a/llvm/test/CodeGen/AMDGPU/spill384.mir b/llvm/test/CodeGen/AMDGPU/spill384.mir index b929391f67c0a..6029ff52e9b5b 100644 --- a/llvm/test/CodeGen/AMDGPU/spill384.mir +++ b/llvm/test/CodeGen/AMDGPU/spill384.mir @@ -30,41 +30,44 @@ body: | ; EXPANDED-LABEL: name: spill_restore_sgpr384 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr14, 10, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr15, 11, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: 
$vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 10, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr15, 11, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; 
EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 - ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 - ; EXPANDED-NEXT: $sgpr15 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 11 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 + ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 10 + ; EXPANDED-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 11 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 bb.0: S_NOP 0, implicit-def %0:sgpr_384 diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll index d209c02dccc82..e2aaa47c3c0b2 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll @@ -22,9 +22,8 @@ define amdgpu_gfx float @caller(float %arg0) { ; GCN-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: s_addk_i32 
s32, 0x400 ; GCN-NEXT: v_writelane_b32 v1, s30, 1 ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index d9b0106ccc4e5..6297b4136fb4a 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -10,20 +10,18 @@ define internal fastcc void @widget() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b32 s16, s33 ; GFX90A-NEXT: s_mov_b32 s33, s32 -; GFX90A-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX90A-NEXT: s_mov_b64 exec, -1 -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, s[18:19] ; GFX90A-NEXT: s_addk_i32 s32, 0x400 -; GFX90A-NEXT: v_writelane_b32 v40, s16, 0 +; GFX90A-NEXT: v_writelane_b32 v41, s16, 0 ; GFX90A-NEXT: s_getpc_b64 s[16:17] ; GFX90A-NEXT: s_add_u32 s16, s16, wobble@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s17, s17, wobble@gotpcrel32@hi+12 ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 -; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: v_writelane_b32 v40, s30, 0 +; GFX90A-NEXT: v_writelane_b32 v40, s31, 1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17] bb: @@ -37,12 +35,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[54:55], s[6:7] ; GLOBALNESS1-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS1-NEXT: 
v_mov_b32_e32 v41, v0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off +; GLOBALNESS1-NEXT: global_store_dword v[0:1], v40, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v0, v42, s[36:37] +; GLOBALNESS1-NEXT: global_load_dword v0, v40, s[36:37] ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS1-NEXT: s_mov_b64 s[64:65], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 @@ -50,11 +48,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, 0x40994400 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s38, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], v[42:43] -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], 0 +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[94:95], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 @@ -67,34 +65,33 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 ; GLOBALNESS1-NEXT: s_xor_b64 s[86:87], s[4:5], -1 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr40 ; GLOBALNESS1-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 ; GLOBALNESS1-NEXT: s_mov_b32 s98, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[62:63], s[8:9] ; GLOBALNESS1-NEXT: s_mov_b32 s99, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s56, s14 +; GLOBALNESS1-NEXT: 
s_mov_b32 s100, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS1-NEXT: s_mov_b64 s[92:93], 0x80 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[36:37], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 ; GLOBALNESS1-NEXT: s_mov_b32 s69, 0x3ff00000 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s4, 0 -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s5, 1 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 0 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 1 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s4, 2 -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s5, 3 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 2 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 3 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s4, 4 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 4 ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s5, 5 +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 5 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v40, 4 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v40, 5 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 4 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 5 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow6 @@ -144,19 +141,19 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] 
op_sel:[0,1] ; GLOBALNESS1-NEXT: flat_load_dword v44, v[0:1] ; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 -; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 +; GLOBALNESS1-NEXT: buffer_store_dword v40, off, s[0:3], 0 ; GLOBALNESS1-NEXT: flat_load_dword v45, v[0:1] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_8 @@ -223,23 +220,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[32:33], off -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v40, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v40, 1 -; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b32 s75, s39 +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v42, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v42, 1 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 
v[2:3], v[40:41], off ; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc +; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[42:43] +; GLOBALNESS1-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[36:37], 0, v[0:1] +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_15 ; GLOBALNESS1-NEXT: .LBB1_13: ; %Flow7 @@ -260,15 +257,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[40:41] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[40:41] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[56:57] ; GLOBALNESS1-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[90:91] @@ -281,28 +278,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; 
GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], a[32:33], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_13 ; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -317,14 +314,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_branch .LBB1_3 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow14 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b32 s36, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s37, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s38, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s39, s93 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b32 s40, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s41, s93 -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[42:43] ; GLOBALNESS1-NEXT: s_mov_b32 s42, s93 ; 
GLOBALNESS1-NEXT: s_mov_b32 s43, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s44, s93 @@ -353,10 +350,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] -; GLOBALNESS1-NEXT: s_mov_b64 s[42:43], s[6:7] -; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[6:7] +; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[4:5] ; GLOBALNESS1-NEXT: s_mov_b32 s39, s75 -; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[72:73] +; GLOBALNESS1-NEXT: s_mov_b64 s[42:43], s[72:73] ; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[70:71] @@ -364,21 +361,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v40, 2 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v40, 3 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 2 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 3 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS1-NEXT: 
v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -390,10 +387,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -408,10 +405,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -423,12 +420,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[6:7] ; GLOBALNESS0-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v0 +; 
GLOBALNESS0-NEXT: v_mov_b32_e32 v40, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off +; GLOBALNESS0-NEXT: global_store_dword v[0:1], v40, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v0, v42, s[36:37] +; GLOBALNESS0-NEXT: global_load_dword v0, v40, s[36:37] ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS0-NEXT: s_mov_b64 s[62:63], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 @@ -436,11 +433,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, 0x40994400 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s38, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], v[42:43] -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], 0 +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[94:95], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 @@ -453,34 +450,33 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 ; GLOBALNESS0-NEXT: s_xor_b64 s[86:87], s[4:5], -1 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr40 ; GLOBALNESS0-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 ; GLOBALNESS0-NEXT: s_mov_b32 s98, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[60:61], s[8:9] ; GLOBALNESS0-NEXT: s_mov_b32 s99, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s56, s14 +; GLOBALNESS0-NEXT: s_mov_b32 s100, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: s_mov_b64 
s[92:93], 0x80 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[36:37], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 ; GLOBALNESS0-NEXT: s_mov_b32 s69, 0x3ff00000 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s4, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s5, 1 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 1 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s4, 2 -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s5, 3 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 2 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 3 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s4, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 4 ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s5, 5 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 5 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v40, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v40, 5 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 5 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6 @@ -530,19 +526,19 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] ; GLOBALNESS0-NEXT: flat_load_dword v44, v[0:1] ; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 
40 -; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 +; GLOBALNESS0-NEXT: buffer_store_dword v40, off, s[0:3], 0 ; GLOBALNESS0-NEXT: flat_load_dword v45, v[0:1] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[36:37] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_8 @@ -609,23 +605,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[32:33], off -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v40, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v40, 1 -; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b32 s75, s39 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 1 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: 
Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc +; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[42:43] +; GLOBALNESS0-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[36:37], 0, v[0:1] +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_15 ; GLOBALNESS0-NEXT: .LBB1_13: ; %Flow7 @@ -646,15 +642,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[40:41] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[40:41] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[90:91] @@ -667,28 +663,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; 
GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], a[32:33], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_13 ; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -703,14 +699,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_branch .LBB1_3 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow14 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b32 s36, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s37, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s38, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s39, s93 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b32 s40, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s41, s93 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[42:43] ; GLOBALNESS0-NEXT: s_mov_b32 s42, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s43, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s44, s93 @@ -739,10 +735,10 @@ 
define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[6:7] -; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[6:7] +; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[4:5] ; GLOBALNESS0-NEXT: s_mov_b32 s39, s75 -; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[72:73] +; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[72:73] ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[70:71] @@ -750,21 +746,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v40, 2 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v40, 3 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 3 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[42:43], off +; 
GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -776,10 +772,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -794,10 +790,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index 0988921a2452b..6dc9b9d0f7e95 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -9,30 +9,29 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v41, 
off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v42, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: v_writelane_b32 v41, s30, 0 -; GCN-NEXT: v_writelane_b32 v41, s31, 1 -; GCN-NEXT: v_writelane_b32 v41, s34, 2 -; GCN-NEXT: v_writelane_b32 v41, s35, 3 -; GCN-NEXT: v_writelane_b32 v41, s36, 4 -; GCN-NEXT: v_writelane_b32 v41, s37, 5 -; GCN-NEXT: v_writelane_b32 v41, s38, 6 -; GCN-NEXT: v_writelane_b32 v41, s39, 7 -; GCN-NEXT: v_writelane_b32 v41, s40, 8 -; GCN-NEXT: v_writelane_b32 v41, s41, 9 -; GCN-NEXT: v_writelane_b32 v41, s42, 10 -; GCN-NEXT: v_writelane_b32 v41, s43, 11 -; GCN-NEXT: v_writelane_b32 v41, s44, 12 -; GCN-NEXT: v_writelane_b32 v41, s45, 13 -; GCN-NEXT: v_writelane_b32 v41, s46, 14 -; GCN-NEXT: v_writelane_b32 v41, s47, 15 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s35, 3 +; GCN-NEXT: v_writelane_b32 v40, s36, 4 +; GCN-NEXT: v_writelane_b32 v40, s37, 5 +; GCN-NEXT: v_writelane_b32 v40, s38, 6 +; GCN-NEXT: v_writelane_b32 v40, s39, 7 +; GCN-NEXT: v_writelane_b32 v40, s40, 8 +; GCN-NEXT: v_writelane_b32 v40, s41, 9 +; GCN-NEXT: v_writelane_b32 v40, s42, 10 +; GCN-NEXT: v_writelane_b32 v40, s43, 11 +; GCN-NEXT: v_writelane_b32 v40, s44, 12 +; GCN-NEXT: v_writelane_b32 v40, s45, 13 +; GCN-NEXT: v_writelane_b32 v40, s46, 14 +; GCN-NEXT: v_writelane_b32 v40, s47, 15 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s42, s15 ; GCN-NEXT: s_mov_b32 s43, s14 ; GCN-NEXT: s_mov_b32 
s44, s13 @@ -69,7 +68,7 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b32 s13, s44 ; GCN-NEXT: s_mov_b32 s14, s43 ; GCN-NEXT: s_mov_b32 s15, s42 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 @@ -101,29 +100,29 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b32 s13, s44 ; GCN-NEXT: s_mov_b32 s14, s43 ; GCN-NEXT: s_mov_b32 s15, s42 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock -; GCN-NEXT: v_readlane_b32 s47, v41, 15 -; GCN-NEXT: v_readlane_b32 s46, v41, 14 -; GCN-NEXT: v_readlane_b32 s45, v41, 13 -; GCN-NEXT: v_readlane_b32 s44, v41, 12 -; GCN-NEXT: v_readlane_b32 s43, v41, 11 -; GCN-NEXT: v_readlane_b32 s42, v41, 10 -; GCN-NEXT: v_readlane_b32 s41, v41, 9 -; GCN-NEXT: v_readlane_b32 s40, v41, 8 -; GCN-NEXT: v_readlane_b32 s39, v41, 7 -; GCN-NEXT: v_readlane_b32 s38, v41, 6 -; GCN-NEXT: v_readlane_b32 s37, v41, 5 -; GCN-NEXT: v_readlane_b32 s36, v41, 4 -; GCN-NEXT: v_readlane_b32 s35, v41, 3 -; GCN-NEXT: v_readlane_b32 s34, v41, 2 -; GCN-NEXT: v_readlane_b32 s31, v41, 1 -; GCN-NEXT: v_readlane_b32 s30, v41, 0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s47, v40, 15 +; GCN-NEXT: v_readlane_b32 s46, v40, 14 +; GCN-NEXT: v_readlane_b32 s45, v40, 13 +; GCN-NEXT: v_readlane_b32 s44, v40, 12 +; GCN-NEXT: v_readlane_b32 s43, v40, 11 +; GCN-NEXT: v_readlane_b32 s42, v40, 10 +; GCN-NEXT: v_readlane_b32 s41, v40, 9 +; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: v_readlane_b32 s39, v40, 7 +; GCN-NEXT: v_readlane_b32 s38, v40, 6 +; GCN-NEXT: v_readlane_b32 s37, v40, 5 +; GCN-NEXT: v_readlane_b32 s36, v40, 4 +; GCN-NEXT: v_readlane_b32 s35, v40, 3 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: 
v_readlane_b32 s30, v40, 0 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: v_readlane_b32 s4, v42, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 @@ -268,38 +267,36 @@ define hidden void @blam() { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v45, s16, 0 +; GCN-NEXT: v_writelane_b32 v46, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v0, s30, 0 -; GCN-NEXT: v_writelane_b32 v0, s31, 1 -; GCN-NEXT: v_writelane_b32 v0, s34, 2 -; GCN-NEXT: v_writelane_b32 v0, s35, 3 -; GCN-NEXT: v_writelane_b32 v0, s36, 4 -; GCN-NEXT: v_writelane_b32 v0, s37, 5 -; GCN-NEXT: 
v_writelane_b32 v0, s38, 6 -; GCN-NEXT: v_writelane_b32 v0, s39, 7 -; GCN-NEXT: v_writelane_b32 v0, s40, 8 -; GCN-NEXT: v_writelane_b32 v0, s41, 9 -; GCN-NEXT: v_writelane_b32 v0, s42, 10 -; GCN-NEXT: v_writelane_b32 v0, s43, 11 -; GCN-NEXT: v_writelane_b32 v0, s44, 12 -; GCN-NEXT: v_writelane_b32 v0, s45, 13 -; GCN-NEXT: v_writelane_b32 v0, s46, 14 -; GCN-NEXT: v_writelane_b32 v0, s47, 15 -; GCN-NEXT: v_writelane_b32 v0, s48, 16 -; GCN-NEXT: v_writelane_b32 v0, s49, 17 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s35, 3 +; GCN-NEXT: v_writelane_b32 v40, s36, 4 +; GCN-NEXT: v_writelane_b32 v40, s37, 5 +; GCN-NEXT: v_writelane_b32 v40, s38, 6 +; GCN-NEXT: v_writelane_b32 v40, s39, 7 +; GCN-NEXT: v_writelane_b32 v40, s40, 8 +; GCN-NEXT: v_writelane_b32 v40, s41, 9 +; GCN-NEXT: v_writelane_b32 v40, s42, 10 +; GCN-NEXT: v_writelane_b32 v40, s43, 11 +; GCN-NEXT: v_writelane_b32 v40, s44, 12 +; GCN-NEXT: v_writelane_b32 v40, s45, 13 +; GCN-NEXT: v_writelane_b32 v40, s46, 14 +; GCN-NEXT: v_writelane_b32 v40, s47, 15 +; GCN-NEXT: v_writelane_b32 v40, s48, 16 +; GCN-NEXT: v_writelane_b32 v40, s49, 17 +; GCN-NEXT: v_mov_b32_e32 v41, v31 ; GCN-NEXT: s_mov_b32 s44, s15 ; GCN-NEXT: s_mov_b32 s45, s14 ; GCN-NEXT: s_mov_b32 s46, s13 @@ -311,24 +308,24 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 -; 
GCN-NEXT: v_mov_b32_e32 v42, 0 -; GCN-NEXT: flat_load_dword v43, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41 +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: flat_load_dword v44, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000 ; GCN-NEXT: s_getpc_b64 s[48:49] ; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[42:43], 0, v43 +; GCN-NEXT: v_cmp_eq_f32_e64 s[42:43], 0, v44 ; GCN-NEXT: s_branch .LBB1_3 ; GCN-NEXT: .LBB1_1: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 ; GCN-NEXT: .LBB1_2: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: .LBB1_3: ; %bb2 ; GCN-NEXT: ; =>This Loop Header: Depth=1 @@ -337,8 +334,8 @@ define hidden void @blam() { ; GCN-NEXT: .LBB1_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: flat_load_dword v0, v[41:42] -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 +; GCN-NEXT: flat_load_dword v0, v[42:43] +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc @@ -370,7 +367,7 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b32 s13, s46 ; GCN-NEXT: s_mov_b32 s14, s45 ; GCN-NEXT: s_mov_b32 s15, s44 -; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: v_mov_b32_e32 v31, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49] ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 @@ -385,10 +382,10 @@ define hidden void @blam() { 
; GCN-NEXT: ; %bb.9: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 ; GCN-NEXT: .LBB1_10: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_branch .LBB1_2 bb: %tmp = load float, ptr null, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll index a97c436d303bb..7285510a5f895 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -14,7 +14,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v36, v16 @@ -22,10 +22,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: v_mov_b32_e32 v34, v14 ; GFX9-NEXT: v_mov_b32_e32 v33, v13 ; GFX9-NEXT: v_mov_b32_e32 v32, v12 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART @@ -34,31 +34,30 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v45, s4, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: v_writelane_b32 v44, s30, 0 -; GFX9-NEXT: v_writelane_b32 v44, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v40 -; GFX9-NEXT: v_mov_b32_e32 v1, v41 -; GFX9-NEXT: v_mov_b32_e32 v2, v42 -; GFX9-NEXT: v_mov_b32_e32 v3, v43 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v44, 1 -; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-NEXT: v_mov_b32_e32 v1, v42 +; GFX9-NEXT: v_mov_b32_e32 v2, v43 +; GFX9-NEXT: v_mov_b32_e32 v3, v44 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v45, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 @@ -73,7 +72,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 @@ -82,10 +81,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: v_mov_b32_e32 v34, v14 ; GFX10-NEXT: v_mov_b32_e32 v33, v13 ; GFX10-NEXT: v_mov_b32_e32 v32, v12 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART @@ -94,33 +93,32 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v45, s4, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr44 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v44, s30, 0 -; GFX10-NEXT: v_writelane_b32 v44, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v40 -; GFX10-NEXT: v_mov_b32_e32 v1, v41 -; GFX10-NEXT: v_mov_b32_e32 v2, v42 -; GFX10-NEXT: v_mov_b32_e32 v3, v43 +; GFX10-NEXT: v_mov_b32_e32 v0, v41 +; GFX10-NEXT: v_mov_b32_e32 v1, v42 +; GFX10-NEXT: v_mov_b32_e32 v2, v43 +; GFX10-NEXT: v_mov_b32_e32 v3, v44 ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v44, 1 -; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, 
off, s[0:3], s33 offset:12 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: v_readlane_b32 s4, v45, 0 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 @@ -137,17 +135,17 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 ; GFX11-NEXT: v_mov_b32_e32 v32, v12 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART @@ -156,31 +154,30 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, 
s32, 32 ; GFX11-NEXT: v_writelane_b32 v45, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v44, s30, 0 -; GFX11-NEXT: v_writelane_b32 v44, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_dual_mov_b32 v0, v40 :: v_dual_mov_b32 v1, v41 -; GFX11-NEXT: v_dual_mov_b32 v2, v42 :: v_dual_mov_b32 v3, v43 +; GFX11-NEXT: v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42 +; GFX11-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 -; GFX11-NEXT: v_readlane_b32 s31, v44, 1 -; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v45, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 @@ -218,45 +215,44 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: 
buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v44, v16 -; GFX9-NEXT: v_mov_b32_e32 v43, v15 -; GFX9-NEXT: v_mov_b32_e32 v42, v14 -; GFX9-NEXT: v_mov_b32_e32 v41, v13 -; GFX9-NEXT: v_mov_b32_e32 v40, v12 -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v45, v16 +; GFX9-NEXT: v_mov_b32_e32 v44, v15 +; GFX9-NEXT: v_mov_b32_e32 v43, v14 +; GFX9-NEXT: v_mov_b32_e32 v42, v13 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v46, s4, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: v_writelane_b32 
v45, s30, 0 -; GFX9-NEXT: v_writelane_b32 v45, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v45, 1 -; GFX9-NEXT: v_readlane_b32 s30, v45, 0 +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v46, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 @@ -271,47 +267,46 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> 
%samp, ; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v46, s4, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: ; implicit-def: $vgpr45 -; GFX10-NEXT: v_mov_b32_e32 v40, v16 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v45, s30, 0 -; GFX10-NEXT: v_mov_b32_e32 v41, v15 -; GFX10-NEXT: v_mov_b32_e32 v42, v14 -; GFX10-NEXT: v_mov_b32_e32 v43, v13 -; GFX10-NEXT: v_mov_b32_e32 v44, v12 
-; GFX10-NEXT: v_writelane_b32 v45, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v16 +; GFX10-NEXT: v_mov_b32_e32 v42, v15 +; GFX10-NEXT: v_mov_b32_e32 v43, v14 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v44, v13 +; GFX10-NEXT: v_mov_b32_e32 v45, v12 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 -; GFX10-NEXT: v_readlane_b32 s31, v45, 1 -; GFX10-NEXT: v_readlane_b32 s30, v45, 0 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: v_readlane_b32 s4, v46, 0 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 @@ -328,45 +323,44 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: s_mov_b32 
s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 ; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:24 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 v46, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: ; implicit-def: $vgpr45 -; GFX11-NEXT: v_dual_mov_b32 v40, v16 :: v_dual_mov_b32 v41, v15 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v45, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v42, v14 :: v_dual_mov_b32 v43, v13 -; GFX11-NEXT: v_mov_b32_e32 v44, v12 -; GFX11-NEXT: v_writelane_b32 v45, s31, 1 +; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 +; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_mov_b32_e32 v45, v12 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, 
v42, v41, v40], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 -; GFX11-NEXT: v_readlane_b32 s31, v45, 1 -; GFX11-NEXT: v_readlane_b32 s30, v45, 0 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v46, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 ; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:24 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index d721bf48e4453..09106f0da591a 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -16,11 +16,10 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: v_writelane_b32 v40, s16, 0 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 -; CHECK-NEXT: v_readlane_b32 s14, v3, 0 +; CHECK-NEXT: 
v_readlane_b32 s14, v40, 0 ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[16:17], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v5, 42 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll deleted file mode 100644 index d3144b2648fc8..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll +++ /dev/null @@ -1,166 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=virtregrewriter,1 --verify-machineinstrs -o - %s | FileCheck -check-prefix=WWM-SPILL %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -stop-after=regallocfast,1 --verify-machineinstrs -o - %s | FileCheck -check-prefix=WWM-SPILL-O0 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-O0 %s - -; Test whole-wave register spilling. - -; In the testcase, the return address registers (SGPR30_SGPR31) should be preserved across the call. -; Since the test limits the VGPR numbers, they are all in the call-clobber (scratch) range and RA should -; spill any VGPR borrowed for spilling SGPRs. The writelane/readlane instructions that spill/restore -; SGPRs into/from VGPR are whole-wave operations and hence the VGPRs involved in such operations require -; whole-wave spilling. 
- -define void @test() #0 { -; WWM-SPILL-LABEL: name: test -; WWM-SPILL: bb.0 (%ir-block.0): -; WWM-SPILL-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 -; WWM-SPILL-NEXT: {{ $}} -; WWM-SPILL-NEXT: renamable $vgpr0 = IMPLICIT_DEF -; WWM-SPILL-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr30, 0, killed $vgpr0 -; WWM-SPILL-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr31, 1, killed $vgpr0 -; WWM-SPILL-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) -; WWM-SPILL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 -; WWM-SPILL-NEXT: renamable $sgpr16_sgpr17 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @ext_func + 4, target-flags(amdgpu-gotprel32-hi) @ext_func + 12, implicit-def dead $scc -; WWM-SPILL-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) -; WWM-SPILL-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @ext_func, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 -; WWM-SPILL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 -; WWM-SPILL-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) -; WWM-SPILL-NEXT: $sgpr31 = V_READLANE_B32 $vgpr0, 1 -; WWM-SPILL-NEXT: $sgpr30 = V_READLANE_B32 killed $vgpr0, 0 -; WWM-SPILL-NEXT: SI_RETURN -; -; WWM-SPILL-O0-LABEL: name: test -; WWM-SPILL-O0: bb.0 (%ir-block.0): -; WWM-SPILL-O0-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, 
$sgpr8_sgpr9, $sgpr10_sgpr11 -; WWM-SPILL-O0-NEXT: {{ $}} -; WWM-SPILL-O0-NEXT: renamable $vgpr0 = IMPLICIT_DEF -; WWM-SPILL-O0-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr0 -; WWM-SPILL-O0-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr0 -; WWM-SPILL-O0-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) -; WWM-SPILL-O0-NEXT: renamable $vgpr0 = COPY $vgpr31 -; WWM-SPILL-O0-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 -; WWM-SPILL-O0-NEXT: renamable $sgpr16_sgpr17 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @ext_func + 4, target-flags(amdgpu-gotprel32-hi) @ext_func + 12, implicit-def dead $scc -; WWM-SPILL-O0-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) -; WWM-SPILL-O0-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 -; WWM-SPILL-O0-NEXT: $vgpr31 = COPY killed renamable $vgpr0 -; WWM-SPILL-O0-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed renamable $sgpr20_sgpr21_sgpr22_sgpr23 -; WWM-SPILL-O0-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @ext_func, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 -; WWM-SPILL-O0-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) -; WWM-SPILL-O0-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 -; WWM-SPILL-O0-NEXT: dead $sgpr31 = V_READLANE_B32 $vgpr0, 1 -; WWM-SPILL-O0-NEXT: dead $sgpr30 = V_READLANE_B32 killed $vgpr0, 0 -; WWM-SPILL-O0-NEXT: SI_RETURN -; -; GCN-LABEL: test: 
-; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s16, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: v_writelane_b32 v0, s30, 0 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: v_writelane_b32 v0, s31, 1 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v0, 1 -; GCN-NEXT: v_readlane_b32 s30, v0, 0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s34, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s35, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; 
GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: test: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s16, s33 -; GCN-O0-NEXT: s_mov_b32 s33, s32 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[18:19] -; GCN-O0-NEXT: v_mov_b32_e32 v1, s34 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_mov_b32_e32 v1, s35 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_mov_b32_e32 v1, s16 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_add_i32 s32, s32, 0x800 -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: v_writelane_b32 v0, s30, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s31, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[34:35] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v31 -; GCN-O0-NEXT: s_getpc_b64 s[16:17] -; GCN-O0-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 -; GCN-O0-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 -; GCN-O0-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-O0-NEXT: s_mov_b64 s[22:23], s[2:3] -; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1] -; GCN-O0-NEXT: v_mov_b32_e32 v31, v0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21] -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], 
s33 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[34:35] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s31, v0, 1 -; GCN-O0-NEXT: v_readlane_b32 s30, v0, 0 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s34, v0 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s35, v0 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_add_i32 s32, s32, 0xfffff800 -; GCN-O0-NEXT: s_mov_b32 s33, s4 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - call void @ext_func() - ret void -} - -declare void @ext_func(); - -attributes #0 = { nounwind "amdgpu-num-vgpr"="4" } diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 92e929c5df104..f4765a3286187 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -133,12 +133,10 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 @@ -146,17 +144,16 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s39, s7 ; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39] ; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37] -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3 +; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s43, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -169,25 +166,22 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[44:45], -1 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[44:45] +; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec +; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 5 ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -200,23 +194,20 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: .LBB1_2: ; %merge -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[44:45], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[44:45] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s34, v4, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v4, 5 +; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s36, v4, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v4, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v4, 3 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v3 +; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v4 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s34, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 @@ -225,10 +216,9 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -345,17 +335,13 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) ; GFX9-O0-NEXT: s_mov_b32 s35, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x800 -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s31, 1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 +; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 @@ -378,22 +364,18 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 
s[48:49], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_readlane_b32 s31, v0, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff800 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GFX9-O0-NEXT: s_mov_b32 s33, s35 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -408,9 +390,8 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: ; implicit-def: $vgpr3 -; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-O3-NEXT: v_mov_b32_e32 
v2, s8 ; GFX9-O3-NEXT: s_not_b64 exec, exec @@ -535,39 +516,37 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s44, s33 +; GFX9-O0-NEXT: s_mov_b32 s42, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: 
buffer_store_dword v3, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s31, 1 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 +; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 ; GFX9-O0-NEXT: s_mov_b32 s38, s6 ; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 3 -; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 5 +; GFX9-O0-NEXT: v_writelane_b32 v10, s36, 2 +; GFX9-O0-NEXT: v_writelane_b32 v10, s37, 3 +; GFX9-O0-NEXT: v_writelane_b32 v10, s38, 4 +; GFX9-O0-NEXT: v_writelane_b32 v10, s39, 5 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41 killed $sgpr34_sgpr35 @@ -579,11 +558,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 7 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[42:43], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill -; 
GFX9-O0-NEXT: s_mov_b64 exec, s[42:43] +; GFX9-O0-NEXT: v_writelane_b32 v10, s34, 6 +; GFX9-O0-NEXT: v_writelane_b32 v10, s35, 7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 @@ -600,20 +576,13 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[42:43], -1 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[42:43] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 6 -; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 7 -; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2 -; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3 -; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4 -; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5 +; GFX9-O0-NEXT: v_readlane_b32 s34, v10, 6 +; GFX9-O0-NEXT: v_readlane_b32 s35, v10, 7 +; GFX9-O0-NEXT: v_readlane_b32 s36, v10, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v10, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v10, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v10, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[42:43], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -623,38 +592,36 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: 
buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_readlane_b32 s31, v0, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v0, 0 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 -; GFX9-O0-NEXT: s_mov_b32 s33, s44 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 +; GFX9-O0-NEXT: s_mov_b32 s33, s42 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -674,9 +641,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: ; implicit-def: $vgpr8 -; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 +; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] @@ -912,81 +878,64 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 
offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s30 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s31 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $vgpr11 -; GFX9-O0-NEXT: v_writelane_b32 v11, s64, 0 -; GFX9-O0-NEXT: v_writelane_b32 v11, s65, 1 -; GFX9-O0-NEXT: v_writelane_b32 v11, s66, 2 -; GFX9-O0-NEXT: v_writelane_b32 v11, s67, 3 -; 
GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31] -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_writelane_b32 v42, s64, 0 +; GFX9-O0-NEXT: v_writelane_b32 v42, s65, 1 +; GFX9-O0-NEXT: v_writelane_b32 v42, s66, 2 
+; GFX9-O0-NEXT: v_writelane_b32 v42, s67, 3 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:8 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded 
Spill +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 +; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 @@ -994,114 +943,161 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v50, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; 
GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v53, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v63, s15 -; GFX9-O0-NEXT: v_mov_b32_e32 v62, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v61, s17 -; GFX9-O0-NEXT: v_mov_b32_e32 v60, s18 -; GFX9-O0-NEXT: v_mov_b32_e32 v59, s19 -; GFX9-O0-NEXT: v_mov_b32_e32 v58, s20 -; GFX9-O0-NEXT: v_mov_b32_e32 v57, s21 -; GFX9-O0-NEXT: v_mov_b32_e32 v56, s22 -; GFX9-O0-NEXT: v_mov_b32_e32 v47, s23 -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s24 -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s25 -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s26 -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s27 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s28 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s29 -; GFX9-O0-NEXT: ; kill: def $vgpr40 killed $vgpr40 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr55 killed $vgpr55 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr54 killed $vgpr54 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr52 killed $vgpr52 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr51 killed $vgpr51 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v43, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s19 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s20 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s23 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s24 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s25 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 
v2, s26 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s27 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s28 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s29 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr45 killed $vgpr45 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr46 killed $vgpr46 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr44 killed $vgpr44 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v50 -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: 
v_mov_b32_e32 v5, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v50 -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v50 -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v50 -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v50 -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v50 -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v50 -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v50 -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v50 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1 -; GFX9-O0-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v53 -; GFX9-O0-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v63 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v62 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v61 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v60 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v59 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v58 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v57 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v56 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v47 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v46 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v45 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v44 ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43 -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v42 -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v41 -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v40 -; GFX9-O0-NEXT: v_mov_b32_e32 v27, v55 -; GFX9-O0-NEXT: v_mov_b32_e32 v28, v54 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v29, v53 -; GFX9-O0-NEXT: v_mov_b32_e32 v30, v52 -; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr51 killed $exec -; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword 
v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v45 +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v27, v47 +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44 +; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded 
Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec @@ -1160,11 +1156,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v48, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v49, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v48, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v49, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 @@ -1182,9 +1178,9 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 
0 offen offset:28 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v49 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v41 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v48 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 @@ -1212,55 +1208,39 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-O0-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-O0-NEXT: v_mov_b32_e32 v25, s29 -; GFX9-O0-NEXT: v_readlane_b32 s67, v50, 3 -; GFX9-O0-NEXT: v_readlane_b32 s66, v50, 2 -; GFX9-O0-NEXT: v_readlane_b32 s65, v50, 1 -; GFX9-O0-NEXT: v_readlane_b32 s64, v50, 0 -; GFX9-O0-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readfirstlane_b32 s30, v50 -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readfirstlane_b32 s31, v50 +; GFX9-O0-NEXT: v_readlane_b32 s67, v42, 3 +; GFX9-O0-NEXT: v_readlane_b32 s66, v42, 2 +; GFX9-O0-NEXT: v_readlane_b32 s65, v42, 1 +; GFX9-O0-NEXT: v_readlane_b32 s64, v42, 0 +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v33, 
off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -1269,22 +1249,24 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:64 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; GFX9-O3-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4 ; GFX9-O3-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:8 @@ -1315,11 +1297,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3-NEXT: v_mov_b32_e32 v38, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v39, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v48, v9 -; GFX9-O3-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-O3-NEXT: v_mov_b32_e32 v40, v9 +; GFX9-O3-NEXT: v_mov_b32_e32 v41, v10 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v48, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v49, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:4 ; GFX9-O3-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen @@ -1329,8 +1311,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:28 ; GFX9-O3-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:24 -; GFX9-O3-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:36 -; GFX9-O3-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:32 +; GFX9-O3-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:36 +; GFX9-O3-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:32 +; GFX9-O3-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O3-NEXT: s_nop 0 +; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s6 @@ -1358,25 +1343,25 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-O3-NEXT: v_mov_b32_e32 v25, s29 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v39, 
off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] @@ -1411,4 +1396,4 @@ declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i3 declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) -attributes #0 = { "amdgpu-waves-per-eu"="4,4" } +attributes #0 = { "amdgpu-waves-per-eu"="5,5" } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index 3e652170c9707..acff981f98503 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -37,7 +37,6 @@ ; AFTER-PEI-NEXT: occupancy: 5 ; AFTER-PEI-NEXT: scavengeFI: '%fixed-stack.0' ; AFTER-PEI-NEXT: vgprForAGPRCopy: '' -; AFTER-PEI-NEXT: sgprForEXECCopy: '' ; AFTER-PEI-NEXT: body: define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { %wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index c82f5f23a893c..172744e060cbf 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -46,7 +46,6 
@@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' -# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -145,7 +144,6 @@ body: | # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' -# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -215,7 +213,6 @@ body: | # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' -# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -286,7 +283,6 @@ body: | # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' -# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -533,28 +529,3 @@ body: | SI_RETURN ... - ---- -# ALL-LABEL: name: sgpr_for_exec_copy -# ALL: sgprForEXECCopy: '$sgpr2_sgpr3' -name: sgpr_for_exec_copy -machineFunctionInfo: - sgprForEXECCopy: '$sgpr2_sgpr3' -body: | - bb.0: - SI_RETURN - -... - ---- -# ALL-LABEL: name: sgpr_for_exec_copy_noreg -# FULL: sgprForEXECCopy: '' -# SIMPLE-NOT: sgprForEXECCopy -name: sgpr_for_exec_copy_noreg -machineFunctionInfo: - sgprForEXECCopy: '$noreg' -body: | - bb.0: - SI_RETURN - -... 
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index 3dc9b8b70db5e..26a35113dae4e 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -40,7 +40,6 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: body: define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0 @@ -81,7 +80,6 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: body: define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0 @@ -136,7 +134,6 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 { ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: body: define void @function() { ret void @@ -183,7 +180,6 @@ define void @function() { ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: body: define void @function_nsz() #0 { ret void diff --git a/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir b/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir deleted file mode 100644 index 172c388e7cb11..0000000000000 --- a/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir +++ /dev/null @@ -1,12 +0,0 @@ -# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -verify-machineinstrs %s 
-o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s - ---- -name: invalid_reg -machineFunctionInfo: -# ERR: [[@LINE+1]]:21: unknown register name 'srst' - sgprForEXECCopy: '$srst' -body: | - bb.0: - S_ENDPGM 0 - -... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir index 240c60e72db21..156891fef3625 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir @@ -3,7 +3,7 @@ # contains not dead objects only. So using objects IDs as offset in the storage # caused out of bounds access. -# RUN: llc -march=amdgcn -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -run-pass=si-lower-sgpr-spills,prologepilog -verify-machineinstrs -o - %s | FileCheck %s # CHECK-LABEL: name: foo # CHECK: {{^}}fixedStack: []