diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index a02c2a4659082..9d062eb156d5c 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -95,7 +95,8 @@ static void getVGPRSpillLaneOrTempRegister( TargetStackID::SGPRSpill); if (TRI->spillSGPRToVGPR() && - MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) { + MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true, + /*IsPrologEpilog=*/true)) { // 2: There's no free lane to spill, and no free register to save the // SGPR, so we're forced to take another VGPR to use for the spill. MFI->addToPrologEpilogSGPRSpills( @@ -1560,6 +1561,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) return; + MFI->shiftSpillPhysVGPRsToLowestRange(MF); + TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); if (MFI->isEntryFunction()) return; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 70ffb8ea0a622..4b13825040ebe 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -368,7 +368,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // regalloc aware CFI generation to insert new CFIs along with the // intermediate spills is implemented. There is no such support // currently exist in the LLVM compiler. 
- if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) { + if (FuncInfo->allocateSGPRSpillToVGPRLane( + MF, FI, /*SpillToPhysVGPRLane=*/true)) { bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( MI, FI, nullptr, Indexes, LIS, true); if (!Spilled) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index e8142244b7db6..b94d143a75e5e 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -312,6 +312,33 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, return false; } +void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange( + MachineFunction &MF) { + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (unsigned I = 0, E = SpillPhysVGPRs.size(); I < E; ++I) { + Register Reg = SpillPhysVGPRs[I]; + Register NewReg = + TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (!NewReg || NewReg >= Reg) + break; + + MRI.replaceRegWith(Reg, NewReg); + + // Update various tables with the new VGPR. 
+ SpillPhysVGPRs[I] = NewReg; + WWMReservedRegs.remove(Reg); + WWMReservedRegs.insert(NewReg); + WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg])); + WWMSpills.erase(Reg); + + for (MachineBasicBlock &MBB : MF) { + MBB.removeLiveIn(Reg); + MBB.sortUniqueLiveIns(); + } + } +} + bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( MachineFunction &MF, int FI, unsigned LaneIndex) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -329,13 +356,17 @@ bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( } bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( - MachineFunction &MF, int FI, unsigned LaneIndex) { + MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + // Find the highest available register if called before RA to ensure the + // lowest registers are available for allocation. The LaneVGPR, in that + // case, will be shifted back to the lowest range after VGPR allocation. + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF, + !IsPrologEpilog); if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not // partially spill the SGPR to VGPRs. @@ -359,12 +390,12 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( return true; } -bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, - int FI, - bool IsPrologEpilog) { +bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane( + MachineFunction &MF, int FI, bool SpillToPhysVGPRLane, + bool IsPrologEpilog) { std::vector<SIRegisterInfo::SpilledReg> &SpillLanes = - IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI] - : SGPRSpillsToVirtualVGPRLanes[FI]; + SpillToPhysVGPRLane ? 
SGPRSpillsToPhysicalVGPRLanes[FI] + : SGPRSpillsToVirtualVGPRLanes[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -384,14 +415,15 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, assert(ST.getRegisterInfo()->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs"); - unsigned &NumSpillLanes = - IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes; + unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes + : NumVirtualVGPRSpillLanes; for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) { unsigned LaneIndex = (NumSpillLanes % WaveSize); - bool Allocated = IsPrologEpilog - ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex) + bool Allocated = SpillToPhysVGPRLane + ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex, + IsPrologEpilog) : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex); if (!Allocated) { NumSpillLanes -= I; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index ecc31fbd9dd3d..9ff66a094f991 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -548,7 +548,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI, unsigned LaneIndex); bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); + unsigned LaneIndex, + bool IsPrologEpilog); public: Register getVGPRForAGPRCopy() const { @@ -588,6 +589,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, } ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; } + const WWMSpillsMap &getWWMSpills() const { return WWMSpills; } const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; } @@ -702,7 +704,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, I->second.IsDead = true; } + // To bring the Physical VGPRs in the highest range 
allocated for CSR SGPR + // spilling into the lowest available range. + void shiftSpillPhysVGPRsToLowestRange(MachineFunction &MF); + bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, + bool SpillToPhysVGPRLane = false, bool IsPrologEpilog = false); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index e65eca7810610..bdd7ff11fde63 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -116,38 +116,38 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s16, 14 +; CHECK-NEXT: v_writelane_b32 v43, s30, 0 +; CHECK-NEXT: v_writelane_b32 v43, s31, 1 +; CHECK-NEXT: v_writelane_b32 v43, s34, 2 +; CHECK-NEXT: v_writelane_b32 v43, s35, 3 +; CHECK-NEXT: v_writelane_b32 v43, s36, 4 +; CHECK-NEXT: v_writelane_b32 v43, s37, 5 +; CHECK-NEXT: v_writelane_b32 v43, s38, 6 +; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: 
v_writelane_b32 v43, s40, 8 +; CHECK-NEXT: v_writelane_b32 v43, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v43, v1 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43 +; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_mov_b32_e32 v42, v1 +; CHECK-NEXT: v_writelane_b32 v43, s44, 12 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v2 +; CHECK-NEXT: v_mov_b32_e32 v41, v2 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 @@ -156,7 +156,7 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 ; CHECK-NEXT: s_getpc_b64 
s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 @@ -170,32 +170,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42 -; CHECK-NEXT: v_and_b32_e32 v2, v2, v43 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v41 +; CHECK-NEXT: v_and_b32_e32 v2, v2, v42 +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: v_readlane_b32 s45, v43, 13 +; CHECK-NEXT: v_readlane_b32 s44, v43, 12 +; CHECK-NEXT: v_readlane_b32 s43, v43, 11 +; CHECK-NEXT: v_readlane_b32 s42, v43, 10 +; 
CHECK-NEXT: v_readlane_b32 s41, v43, 9 +; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s39, v43, 7 +; CHECK-NEXT: v_readlane_b32 s38, v43, 6 +; CHECK-NEXT: v_readlane_b32 s37, v43, 5 +; CHECK-NEXT: v_readlane_b32 s36, v43, 4 +; CHECK-NEXT: v_readlane_b32 s35, v43, 3 +; CHECK-NEXT: v_readlane_b32 s34, v43, 2 +; CHECK-NEXT: v_readlane_b32 s31, v43, 1 +; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 @@ -257,37 +257,37 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s16, 14 +; CHECK-NEXT: v_writelane_b32 v43, s30, 0 +; CHECK-NEXT: v_writelane_b32 v43, s31, 1 +; CHECK-NEXT: v_writelane_b32 v43, s34, 2 +; CHECK-NEXT: v_writelane_b32 v43, s35, 3 +; CHECK-NEXT: v_writelane_b32 v43, s36, 4 +; CHECK-NEXT: v_writelane_b32 v43, s37, 5 +; CHECK-NEXT: v_writelane_b32 v43, s38, 6 +; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 
0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: v_writelane_b32 v43, s40, 8 +; CHECK-NEXT: v_writelane_b32 v43, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 +; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_writelane_b32 v43, s44, 12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v43, v31 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v42, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v3 -; CHECK-NEXT: v_mov_b32_e32 v41, v2 +; CHECK-NEXT: v_mov_b32_e32 v41, v3 +; CHECK-NEXT: v_mov_b32_e32 v40, v2 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 @@ -296,7 +296,7 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[41:42] +; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[40:41] ; CHECK-NEXT: 
s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 @@ -309,29 +309,29 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v43 +; CHECK-NEXT: v_mov_b32_e32 v31, v42 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s45, v43, 13 +; CHECK-NEXT: v_readlane_b32 s44, v43, 12 +; CHECK-NEXT: v_readlane_b32 s43, v43, 11 +; CHECK-NEXT: v_readlane_b32 s42, v43, 10 +; CHECK-NEXT: v_readlane_b32 s41, v43, 9 +; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s39, v43, 7 +; CHECK-NEXT: v_readlane_b32 s38, v43, 6 +; CHECK-NEXT: v_readlane_b32 s37, v43, 5 +; 
CHECK-NEXT: v_readlane_b32 s36, v43, 4 +; CHECK-NEXT: v_readlane_b32 s35, v43, 3 +; CHECK-NEXT: v_readlane_b32 s34, v43, 2 +; CHECK-NEXT: v_readlane_b32 s31, v43, 1 +; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 @@ -400,38 +400,38 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s16, 14 +; CHECK-NEXT: v_writelane_b32 v43, s30, 0 +; CHECK-NEXT: v_writelane_b32 v43, s31, 1 +; CHECK-NEXT: v_writelane_b32 v43, s34, 2 +; CHECK-NEXT: v_writelane_b32 v43, s35, 3 +; CHECK-NEXT: v_writelane_b32 v43, s36, 4 +; CHECK-NEXT: v_writelane_b32 v43, s37, 5 +; CHECK-NEXT: v_writelane_b32 v43, s38, 6 +; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: v_writelane_b32 v43, s40, 8 +; CHECK-NEXT: v_writelane_b32 v43, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], 
s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v43, v1 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43 +; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_mov_b32_e32 v42, v1 +; CHECK-NEXT: v_writelane_b32 v43, s44, 12 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v2 +; CHECK-NEXT: v_mov_b32_e32 v41, v2 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 @@ -440,7 +440,7 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, 
_Z4exp2d@gotpcrel32@hi+12 @@ -454,32 +454,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42 -; CHECK-NEXT: v_and_b32_e32 v2, v2, v43 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v41 +; CHECK-NEXT: v_and_b32_e32 v2, v2, v42 +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: v_readlane_b32 s45, v43, 13 +; CHECK-NEXT: v_readlane_b32 s44, v43, 12 +; CHECK-NEXT: v_readlane_b32 s43, v43, 11 +; CHECK-NEXT: v_readlane_b32 s42, v43, 10 +; CHECK-NEXT: v_readlane_b32 s41, v43, 9 +; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s39, 
v43, 7 +; CHECK-NEXT: v_readlane_b32 s38, v43, 6 +; CHECK-NEXT: v_readlane_b32 s37, v43, 5 +; CHECK-NEXT: v_readlane_b32 s36, v43, 4 +; CHECK-NEXT: v_readlane_b32 s35, v43, 3 +; CHECK-NEXT: v_readlane_b32 s34, v43, 2 +; CHECK-NEXT: v_readlane_b32 s31, v43, 1 +; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 @@ -543,34 +543,34 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v42, s16, 14 +; CHECK-NEXT: v_writelane_b32 v42, s30, 0 +; CHECK-NEXT: v_writelane_b32 v42, s31, 1 +; CHECK-NEXT: v_writelane_b32 v42, s34, 2 +; CHECK-NEXT: v_writelane_b32 v42, s35, 3 +; CHECK-NEXT: v_writelane_b32 v42, s36, 4 +; CHECK-NEXT: v_writelane_b32 v42, s37, 5 +; CHECK-NEXT: v_writelane_b32 v42, s38, 6 +; CHECK-NEXT: v_writelane_b32 v42, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: 
v_writelane_b32 v42, s40, 8 +; CHECK-NEXT: v_writelane_b32 v42, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 +; CHECK-NEXT: v_writelane_b32 v42, s42, 10 +; CHECK-NEXT: v_writelane_b32 v42, s43, 11 +; CHECK-NEXT: v_writelane_b32 v42, s44, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v42, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 @@ -578,10 +578,10 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s45, s12 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] -; CHECK-NEXT: v_lshlrev_b32_e32 v42, 1, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v41, 1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 @@ -595,28 +595,28 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 
s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s45, v42, 13 +; CHECK-NEXT: v_readlane_b32 s44, v42, 12 +; CHECK-NEXT: v_readlane_b32 s43, v42, 11 +; CHECK-NEXT: v_readlane_b32 s42, v42, 10 +; CHECK-NEXT: v_readlane_b32 s41, v42, 9 +; CHECK-NEXT: v_readlane_b32 s40, v42, 8 +; CHECK-NEXT: v_readlane_b32 s39, v42, 7 +; CHECK-NEXT: v_readlane_b32 s38, v42, 6 +; CHECK-NEXT: v_readlane_b32 s37, v42, 5 +; CHECK-NEXT: v_readlane_b32 s36, v42, 4 +; CHECK-NEXT: v_readlane_b32 s35, v42, 3 +; CHECK-NEXT: v_readlane_b32 s34, v42, 2 +; CHECK-NEXT: v_readlane_b32 s31, v42, 1 +; CHECK-NEXT: v_readlane_b32 s30, v42, 0 +; CHECK-NEXT: v_readlane_b32 s4, v42, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, 
off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 @@ -685,36 +685,36 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s16, 14 +; CHECK-NEXT: v_writelane_b32 v43, s30, 0 +; CHECK-NEXT: v_writelane_b32 v43, s31, 1 +; CHECK-NEXT: v_writelane_b32 v43, s34, 2 +; CHECK-NEXT: v_writelane_b32 v43, s35, 3 +; CHECK-NEXT: v_writelane_b32 v43, s36, 4 +; CHECK-NEXT: v_writelane_b32 v43, s37, 5 +; CHECK-NEXT: v_writelane_b32 v43, s38, 6 +; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: v_writelane_b32 v43, s40, 8 +; CHECK-NEXT: v_writelane_b32 v43, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 +; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_mov_b32_e32 v41, v1 +; CHECK-NEXT: v_writelane_b32 v43, s44, 12 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 @@ -722,10 +722,10 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s45, s12 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] -; CHECK-NEXT: v_or_b32_e32 v43, 1, v2 +; CHECK-NEXT: v_or_b32_e32 v42, 1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v43 +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 @@ -739,31 +739,31 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], 
s[16:17] -; CHECK-NEXT: v_and_b32_e32 v2, 0x80000000, v42 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_and_b32_e32 v2, 0x80000000, v41 +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: v_readlane_b32 s45, v43, 13 +; CHECK-NEXT: v_readlane_b32 s44, v43, 12 +; CHECK-NEXT: v_readlane_b32 s43, v43, 11 +; CHECK-NEXT: v_readlane_b32 s42, v43, 10 +; CHECK-NEXT: v_readlane_b32 s41, v43, 9 +; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s39, v43, 7 +; CHECK-NEXT: v_readlane_b32 s38, v43, 6 +; CHECK-NEXT: v_readlane_b32 s37, v43, 5 +; CHECK-NEXT: v_readlane_b32 s36, v43, 4 +; CHECK-NEXT: v_readlane_b32 s35, v43, 3 +; CHECK-NEXT: v_readlane_b32 s34, v43, 2 +; CHECK-NEXT: v_readlane_b32 s31, v43, 1 +; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, 
off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 4a696879ad7b2..2f7190e761102 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3544,12 +3544,12 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v3, s30, 0 -; GCN-NEXT: v_writelane_b32 v3, s31, 1 +; GCN-NEXT: v_writelane_b32 v4, s30, 0 +; GCN-NEXT: v_writelane_b32 v4, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -3558,15 +3558,15 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v2 -; GCN-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2 +; GCN-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v3, 1 -; GCN-NEXT: v_readlane_b32 s30, v3, 0 +; GCN-NEXT: v_readlane_b32 s31, v4, 1 +; GCN-NEXT: v_readlane_b32 s30, v4, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v3, off, 
s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -3579,28 +3579,28 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v3, s30, 0 -; GFX7-NEXT: v_writelane_b32 v3, s31, 1 +; GFX7-NEXT: v_writelane_b32 v4, s30, 0 +; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 2, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v3, 1 -; GFX7-NEXT: v_readlane_b32 s30, v3, 0 +; GFX7-NEXT: v_readlane_b32 s31, v4, 1 +; GFX7-NEXT: v_readlane_b32 s30, v4, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s8 @@ -3737,12 +3737,12 @@ define void 
@test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v4, s30, 0 -; GCN-NEXT: v_writelane_b32 v4, s31, 1 +; GCN-NEXT: v_writelane_b32 v5, s30, 0 +; GCN-NEXT: v_writelane_b32 v5, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -3751,16 +3751,16 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: buffer_store_short v2, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v4, 1 -; GCN-NEXT: v_readlane_b32 s30, v4, 0 +; GCN-NEXT: v_readlane_b32 s31, v5, 1 +; GCN-NEXT: v_readlane_b32 s30, v5, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -3808,26 +3808,26 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: 
buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v3, s30, 0 -; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: v_writelane_b32 v4, s30, 0 +; GFX8-NEXT: v_writelane_b32 v4, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v2 -; GFX8-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v3, 1 -; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: v_readlane_b32 s31, v4, 1 +; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s6 @@ -3941,12 +3941,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v5, s30, 0 -; GCN-NEXT: v_writelane_b32 v5, s31, 1 +; GCN-NEXT: 
v_writelane_b32 v8, s30, 0 +; GCN-NEXT: v_writelane_b32 v8, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -3957,21 +3957,21 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 6, v4 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 4, v4 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v4 -; GCN-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v5, vcc, 6, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v4 +; GCN-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v6, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v7, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v5, 1 -; GCN-NEXT: v_readlane_b32 s30, v5, 0 +; GCN-NEXT: v_readlane_b32 s31, v8, 1 +; GCN-NEXT: v_readlane_b32 s30, v8, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -3984,21 +3984,21 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], 
s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v5, s30, 0 -; GFX7-NEXT: v_writelane_b32 v5, s31, 1 +; GFX7-NEXT: v_writelane_b32 v6, s30, 0 +; GFX7-NEXT: v_writelane_b32 v6, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 6, v4 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -4010,10 +4010,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v5, 1 -; GFX7-NEXT: v_readlane_b32 s30, v5, 0 +; GFX7-NEXT: v_readlane_b32 s31, v6, 1 +; GFX7-NEXT: v_readlane_b32 s30, v6, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s8 @@ -4026,26 +4026,26 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; 
GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v3, s30, 0 -; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: v_writelane_b32 v4, s30, 0 +; GFX8-NEXT: v_writelane_b32 v4, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v2 -; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v3, 1 -; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: v_readlane_b32 s31, v4, 1 +; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s6 @@ -4157,12 +4157,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v9, s30, 0 -; GCN-NEXT: v_writelane_b32 v9, s31, 1 +; GCN-NEXT: v_writelane_b32 v16, s30, 0 +; GCN-NEXT: v_writelane_b32 v16, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, 
test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -4177,33 +4177,33 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 14, v8 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 12, v8 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v8 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 8, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 6, v8 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v8 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 2, v8 -; GCN-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v9, vcc, 14, v8 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 12, v8 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 10, v8 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 8, v8 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 6, v8 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v8 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v8 +; GCN-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v6, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v5, v11, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v4, v12, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v3, v13, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 
offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v9, 1 -; GCN-NEXT: v_readlane_b32 s30, v9, 0 +; GCN-NEXT: v_readlane_b32 s31, v16, 1 +; GCN-NEXT: v_readlane_b32 s30, v16, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -4216,21 +4216,21 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v9, s30, 0 -; GFX7-NEXT: v_writelane_b32 v9, s31, 1 +; GFX7-NEXT: v_writelane_b32 v10, s30, 0 +; GFX7-NEXT: v_writelane_b32 v10, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 14, v8 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 14, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -4258,10 +4258,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) 
-; GFX7-NEXT: v_readlane_b32 s31, v9, 1 -; GFX7-NEXT: v_readlane_b32 s30, v9, 0 +; GFX7-NEXT: v_readlane_b32 s31, v10, 1 +; GFX7-NEXT: v_readlane_b32 s30, v10, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s8 @@ -4274,19 +4274,19 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v5, s30, 0 -; GFX8-NEXT: v_writelane_b32 v5, s31, 1 +; GFX8-NEXT: v_writelane_b32 v6, s30, 0 +; GFX8-NEXT: v_writelane_b32 v6, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 12, v4 -; GFX8-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4 +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen @@ -4296,10 +4296,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v5, 1 -; GFX8-NEXT: v_readlane_b32 s30, v5, 0 +; GFX8-NEXT: 
v_readlane_b32 s31, v6, 1 +; GFX8-NEXT: v_readlane_b32 s30, v6, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s6 @@ -4419,12 +4419,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v17, s30, 0 -; GCN-NEXT: v_writelane_b32 v17, s31, 1 +; GCN-NEXT: v_writelane_b32 v28, s30, 0 +; GCN-NEXT: v_writelane_b32 v28, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -4447,57 +4447,57 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 30, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 26, v16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 24, v16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 22, v16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 20, v16 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 18, v16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 16, v16 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 14, v16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 12, v16 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 10, v16 -; GCN-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16 +; GCN-NEXT: 
v_add_i32_e32 v18, vcc, 28, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 22, v16 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 20, v16 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 18, v16 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v16 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 14, v16 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 12, v16 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 10, v16 +; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: v_add_i32_e32 v15, vcc, 8, v16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 6, v16 -; GCN-NEXT: buffer_store_short v14, v19, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16 +; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 2, v16 -; GCN-NEXT: buffer_store_short v13, v20, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v18, vcc, 2, v16 +; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v12, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v11, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v11, v21, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v10, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v10, v22, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v9, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v9, v23, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v8, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v8, v24, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v7, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v7, v25, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
buffer_store_short v6, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v6, v26, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v5, v27, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v4, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v17, 1 -; GCN-NEXT: v_readlane_b32 s30, v17, 0 +; GCN-NEXT: v_readlane_b32 s31, v28, 1 +; GCN-NEXT: v_readlane_b32 s30, v28, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -4510,21 +4510,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v17, s30, 0 -; GFX7-NEXT: 
v_writelane_b32 v17, s31, 1 +; GFX7-NEXT: v_writelane_b32 v18, s30, 0 +; GFX7-NEXT: v_writelane_b32 v18, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 30, v16 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 30, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 @@ -4584,10 +4584,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v17, 1 -; GFX7-NEXT: v_readlane_b32 s30, v17, 0 +; GFX7-NEXT: v_readlane_b32 s31, v18, 1 +; GFX7-NEXT: v_readlane_b32 s30, v18, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s8 @@ -4600,19 +4600,19 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v9, s30, 0 -; GFX8-NEXT: 
v_writelane_b32 v9, s31, 1 +; GFX8-NEXT: v_writelane_b32 v10, s30, 0 +; GFX8-NEXT: v_writelane_b32 v10, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 28, v8 -; GFX8-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8 +; GFX8-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8 ; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen @@ -4634,10 +4634,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v9, 1 -; GFX8-NEXT: v_readlane_b32 s30, v9, 0 +; GFX8-NEXT: v_readlane_b32 s31, v10, 1 +; GFX8-NEXT: v_readlane_b32 s30, v10, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s6 @@ -27303,7 +27303,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -27330,16 +27330,16 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: v_and_b32_e32 v0, 1, v11 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 +; GFX7-NEXT: 
v_writelane_b32 v32, s30, 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 +; GFX7-NEXT: v_writelane_b32 v32, s31, 1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX7-NEXT: v_writelane_b32 v31, s34, 2 +; GFX7-NEXT: v_writelane_b32 v32, s34, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX7-NEXT: v_writelane_b32 v31, s35, 3 +; GFX7-NEXT: v_writelane_b32 v32, s35, 3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 @@ -27357,7 +27357,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; GFX7-NEXT: s_waitcnt vmcnt(14) ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] @@ -27388,7 +27388,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[30:31] ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e64 v15, v32, v15, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e64 v15, v31, v15, s[34:35] ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -27404,12 +27404,12 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: 
v_readlane_b32 s35, v31, 3 -; GFX7-NEXT: v_readlane_b32 s34, v31, 2 -; GFX7-NEXT: v_readlane_b32 s31, v31, 1 -; GFX7-NEXT: v_readlane_b32 s30, v31, 0 +; GFX7-NEXT: v_readlane_b32 s35, v32, 3 +; GFX7-NEXT: v_readlane_b32 s34, v32, 2 +; GFX7-NEXT: v_readlane_b32 s31, v32, 1 +; GFX7-NEXT: v_readlane_b32 s30, v32, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -28211,107 +28211,107 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 -; GFX8-NEXT: v_writelane_b32 v31, s34, 2 +; GFX8-NEXT: v_writelane_b32 v34, s30, 0 +; GFX8-NEXT: v_writelane_b32 v34, s31, 1 +; GFX8-NEXT: v_writelane_b32 v34, s34, 2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_writelane_b32 v31, s35, 3 +; GFX8-NEXT: v_writelane_b32 v34, s35, 3 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX8-NEXT: v_writelane_b32 v31, s36, 4 +; GFX8-NEXT: v_writelane_b32 v34, s36, 4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_writelane_b32 v31, s37, 5 +; GFX8-NEXT: v_writelane_b32 v34, s37, 5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX8-NEXT: v_writelane_b32 v31, s38, 6 +; GFX8-NEXT: v_writelane_b32 v34, s38, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 ; GFX8-NEXT: 
v_and_b32_e32 v0, 1, v4 -; GFX8-NEXT: v_writelane_b32 v31, s39, 7 +; GFX8-NEXT: v_writelane_b32 v34, s39, 7 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX8-NEXT: v_writelane_b32 v31, s40, 8 +; GFX8-NEXT: v_writelane_b32 v34, s40, 8 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX8-NEXT: v_writelane_b32 v31, s41, 9 +; GFX8-NEXT: v_writelane_b32 v34, s41, 9 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX8-NEXT: v_writelane_b32 v31, s42, 10 +; GFX8-NEXT: v_writelane_b32 v34, s42, 10 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX8-NEXT: v_writelane_b32 v31, s43, 11 +; GFX8-NEXT: v_writelane_b32 v34, s43, 11 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX8-NEXT: v_writelane_b32 v31, s44, 12 +; GFX8-NEXT: v_writelane_b32 v34, s44, 12 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX8-NEXT: v_writelane_b32 v31, s45, 13 +; GFX8-NEXT: v_writelane_b32 v34, s45, 13 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX8-NEXT: v_writelane_b32 v31, s46, 14 +; GFX8-NEXT: v_writelane_b32 v34, s46, 14 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX8-NEXT: v_writelane_b32 v31, s47, 15 +; GFX8-NEXT: v_writelane_b32 v34, s47, 15 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX8-NEXT: v_writelane_b32 v31, s48, 16 +; GFX8-NEXT: v_writelane_b32 v34, s48, 16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX8-NEXT: v_writelane_b32 v31, s49, 17 +; GFX8-NEXT: v_writelane_b32 v34, s49, 17 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX8-NEXT: v_writelane_b32 v31, s50, 18 +; GFX8-NEXT: v_writelane_b32 v34, s50, 18 ; GFX8-NEXT: v_cmp_eq_u32_e64 
s[34:35], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX8-NEXT: v_writelane_b32 v31, s51, 19 +; GFX8-NEXT: v_writelane_b32 v34, s51, 19 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX8-NEXT: v_writelane_b32 v31, s52, 20 +; GFX8-NEXT: v_writelane_b32 v34, s52, 20 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX8-NEXT: v_writelane_b32 v31, s53, 21 +; GFX8-NEXT: v_writelane_b32 v34, s53, 21 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX8-NEXT: v_writelane_b32 v31, s54, 22 +; GFX8-NEXT: v_writelane_b32 v34, s54, 22 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX8-NEXT: v_writelane_b32 v31, s55, 23 +; GFX8-NEXT: v_writelane_b32 v34, s55, 23 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX8-NEXT: v_writelane_b32 v31, s56, 24 +; GFX8-NEXT: v_writelane_b32 v34, s56, 24 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX8-NEXT: v_writelane_b32 v31, s57, 25 +; GFX8-NEXT: v_writelane_b32 v34, s57, 25 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX8-NEXT: v_writelane_b32 v31, s58, 26 +; GFX8-NEXT: v_writelane_b32 v34, s58, 26 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX8-NEXT: v_writelane_b32 v31, s59, 27 +; GFX8-NEXT: v_writelane_b32 v34, s59, 27 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX8-NEXT: v_writelane_b32 v31, s60, 28 +; GFX8-NEXT: v_writelane_b32 v34, s60, 28 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX8-NEXT: v_writelane_b32 v31, s61, 29 +; GFX8-NEXT: v_writelane_b32 v34, s61, 29 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX8-NEXT: v_writelane_b32 v31, s62, 30 +; GFX8-NEXT: v_writelane_b32 
v34, s62, 30 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX8-NEXT: v_writelane_b32 v31, s63, 31 +; GFX8-NEXT: v_writelane_b32 v34, s63, 31 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX8-NEXT: v_writelane_b32 v31, s64, 32 +; GFX8-NEXT: v_writelane_b32 v34, s64, 32 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX8-NEXT: v_writelane_b32 v31, s65, 33 +; GFX8-NEXT: v_writelane_b32 v34, s65, 33 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v31, s66, 34 -; GFX8-NEXT: v_writelane_b32 v31, s67, 35 +; GFX8-NEXT: v_writelane_b32 v34, s66, 34 +; GFX8-NEXT: v_writelane_b32 v34, s67, 35 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 @@ -28344,74 +28344,74 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 ; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:128 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v33 -; GFX8-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[66:67] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v33, s[64:65] -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v30 -; GFX8-NEXT: v_cndmask_b32_e64 v33, v34, v33, s[62:63] -; GFX8-NEXT: 
v_cndmask_b32_e64 v30, v30, v32, s[60:61] -; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e64 v32, v34, v32, s[58:59] +; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[66:67] +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[64:65] +; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[62:63] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[60:61] +; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[58:59] ; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] ; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v27, v34, v27, s[54:55] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[54:55] ; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] ; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e64 v25, v34, v25, s[50:51] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[50:51] ; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49] ; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v34, v23, s[46:47] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[46:47] ; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] ; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v21, v34, v21, s[42:43] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[42:43] ; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] ; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; 
GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v34, v19, s[38:39] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[38:39] ; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v34, v17, s[34:35] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[34:35] ; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v34, v15, s[28:29] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29] ; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v34, v13, s[24:25] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v34, v11, s[20:21] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21] ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v34, v9, s[16:17] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v34, v7, s[12:13] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13] ; GFX8-NEXT: 
v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v34, v5, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v34, v3, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28434,8 +28434,8 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v25 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v28 ; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28445,44 +28445,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_readlane_b32 s67, v31, 35 -; GFX8-NEXT: v_readlane_b32 s66, v31, 34 -; GFX8-NEXT: v_readlane_b32 s65, v31, 33 -; GFX8-NEXT: 
v_readlane_b32 s64, v31, 32 -; GFX8-NEXT: v_readlane_b32 s63, v31, 31 -; GFX8-NEXT: v_readlane_b32 s62, v31, 30 -; GFX8-NEXT: v_readlane_b32 s61, v31, 29 -; GFX8-NEXT: v_readlane_b32 s60, v31, 28 -; GFX8-NEXT: v_readlane_b32 s59, v31, 27 -; GFX8-NEXT: v_readlane_b32 s58, v31, 26 -; GFX8-NEXT: v_readlane_b32 s57, v31, 25 -; GFX8-NEXT: v_readlane_b32 s56, v31, 24 -; GFX8-NEXT: v_readlane_b32 s55, v31, 23 -; GFX8-NEXT: v_readlane_b32 s54, v31, 22 -; GFX8-NEXT: v_readlane_b32 s53, v31, 21 -; GFX8-NEXT: v_readlane_b32 s52, v31, 20 -; GFX8-NEXT: v_readlane_b32 s51, v31, 19 -; GFX8-NEXT: v_readlane_b32 s50, v31, 18 -; GFX8-NEXT: v_readlane_b32 s49, v31, 17 -; GFX8-NEXT: v_readlane_b32 s48, v31, 16 -; GFX8-NEXT: v_readlane_b32 s47, v31, 15 -; GFX8-NEXT: v_readlane_b32 s46, v31, 14 -; GFX8-NEXT: v_readlane_b32 s45, v31, 13 -; GFX8-NEXT: v_readlane_b32 s44, v31, 12 -; GFX8-NEXT: v_readlane_b32 s43, v31, 11 -; GFX8-NEXT: v_readlane_b32 s42, v31, 10 -; GFX8-NEXT: v_readlane_b32 s41, v31, 9 -; GFX8-NEXT: v_readlane_b32 s40, v31, 8 -; GFX8-NEXT: v_readlane_b32 s39, v31, 7 -; GFX8-NEXT: v_readlane_b32 s38, v31, 6 -; GFX8-NEXT: v_readlane_b32 s37, v31, 5 -; GFX8-NEXT: v_readlane_b32 s36, v31, 4 -; GFX8-NEXT: v_readlane_b32 s35, v31, 3 -; GFX8-NEXT: v_readlane_b32 s34, v31, 2 -; GFX8-NEXT: v_readlane_b32 s31, v31, 1 -; GFX8-NEXT: v_readlane_b32 s30, v31, 0 +; GFX8-NEXT: v_readlane_b32 s67, v34, 35 +; GFX8-NEXT: v_readlane_b32 s66, v34, 34 +; GFX8-NEXT: v_readlane_b32 s65, v34, 33 +; GFX8-NEXT: v_readlane_b32 s64, v34, 32 +; GFX8-NEXT: v_readlane_b32 s63, v34, 31 +; GFX8-NEXT: v_readlane_b32 s62, v34, 30 +; GFX8-NEXT: v_readlane_b32 s61, v34, 29 +; GFX8-NEXT: v_readlane_b32 s60, v34, 28 +; GFX8-NEXT: v_readlane_b32 s59, v34, 27 +; GFX8-NEXT: v_readlane_b32 s58, v34, 26 +; GFX8-NEXT: v_readlane_b32 s57, v34, 25 +; GFX8-NEXT: v_readlane_b32 s56, v34, 24 +; GFX8-NEXT: v_readlane_b32 s55, v34, 23 +; GFX8-NEXT: v_readlane_b32 s54, v34, 22 +; GFX8-NEXT: v_readlane_b32 s53, v34, 21 +; 
GFX8-NEXT: v_readlane_b32 s52, v34, 20 +; GFX8-NEXT: v_readlane_b32 s51, v34, 19 +; GFX8-NEXT: v_readlane_b32 s50, v34, 18 +; GFX8-NEXT: v_readlane_b32 s49, v34, 17 +; GFX8-NEXT: v_readlane_b32 s48, v34, 16 +; GFX8-NEXT: v_readlane_b32 s47, v34, 15 +; GFX8-NEXT: v_readlane_b32 s46, v34, 14 +; GFX8-NEXT: v_readlane_b32 s45, v34, 13 +; GFX8-NEXT: v_readlane_b32 s44, v34, 12 +; GFX8-NEXT: v_readlane_b32 s43, v34, 11 +; GFX8-NEXT: v_readlane_b32 s42, v34, 10 +; GFX8-NEXT: v_readlane_b32 s41, v34, 9 +; GFX8-NEXT: v_readlane_b32 s40, v34, 8 +; GFX8-NEXT: v_readlane_b32 s39, v34, 7 +; GFX8-NEXT: v_readlane_b32 s38, v34, 6 +; GFX8-NEXT: v_readlane_b32 s37, v34, 5 +; GFX8-NEXT: v_readlane_b32 s36, v34, 4 +; GFX8-NEXT: v_readlane_b32 s35, v34, 3 +; GFX8-NEXT: v_readlane_b32 s34, v34, 2 +; GFX8-NEXT: v_readlane_b32 s31, v34, 1 +; GFX8-NEXT: v_readlane_b32 s30, v34, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -28491,104 +28491,104 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v31, s30, 0 -; GFX9-NEXT: v_writelane_b32 v31, s31, 1 -; GFX9-NEXT: v_writelane_b32 v31, s34, 2 +; GFX9-NEXT: v_writelane_b32 v33, s30, 0 +; GFX9-NEXT: v_writelane_b32 v33, s31, 1 +; GFX9-NEXT: v_writelane_b32 v33, s34, 2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_writelane_b32 v31, s35, 3 +; GFX9-NEXT: v_writelane_b32 v33, s35, 3 ; 
GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_writelane_b32 v31, s36, 4 +; GFX9-NEXT: v_writelane_b32 v33, s36, 4 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_writelane_b32 v31, s37, 5 +; GFX9-NEXT: v_writelane_b32 v33, s37, 5 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX9-NEXT: v_writelane_b32 v31, s38, 6 +; GFX9-NEXT: v_writelane_b32 v33, s38, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX9-NEXT: v_writelane_b32 v31, s39, 7 +; GFX9-NEXT: v_writelane_b32 v33, s39, 7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX9-NEXT: v_writelane_b32 v31, s40, 8 +; GFX9-NEXT: v_writelane_b32 v33, s40, 8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX9-NEXT: v_writelane_b32 v31, s41, 9 +; GFX9-NEXT: v_writelane_b32 v33, s41, 9 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX9-NEXT: v_writelane_b32 v31, s42, 10 +; GFX9-NEXT: v_writelane_b32 v33, s42, 10 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX9-NEXT: v_writelane_b32 v31, s43, 11 +; GFX9-NEXT: v_writelane_b32 v33, s43, 11 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX9-NEXT: v_writelane_b32 v31, s44, 12 +; GFX9-NEXT: v_writelane_b32 v33, s44, 12 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX9-NEXT: v_writelane_b32 v31, s45, 13 +; GFX9-NEXT: v_writelane_b32 v33, s45, 13 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX9-NEXT: v_writelane_b32 v31, s46, 14 +; GFX9-NEXT: v_writelane_b32 v33, s46, 14 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX9-NEXT: v_writelane_b32 v31, s47, 15 +; GFX9-NEXT: v_writelane_b32 
v33, s47, 15 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX9-NEXT: v_writelane_b32 v31, s48, 16 +; GFX9-NEXT: v_writelane_b32 v33, s48, 16 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX9-NEXT: v_writelane_b32 v31, s49, 17 +; GFX9-NEXT: v_writelane_b32 v33, s49, 17 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX9-NEXT: v_writelane_b32 v31, s50, 18 +; GFX9-NEXT: v_writelane_b32 v33, s50, 18 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX9-NEXT: v_writelane_b32 v31, s51, 19 +; GFX9-NEXT: v_writelane_b32 v33, s51, 19 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX9-NEXT: v_writelane_b32 v31, s52, 20 +; GFX9-NEXT: v_writelane_b32 v33, s52, 20 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX9-NEXT: v_writelane_b32 v31, s53, 21 +; GFX9-NEXT: v_writelane_b32 v33, s53, 21 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX9-NEXT: v_writelane_b32 v31, s54, 22 +; GFX9-NEXT: v_writelane_b32 v33, s54, 22 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX9-NEXT: v_writelane_b32 v31, s55, 23 +; GFX9-NEXT: v_writelane_b32 v33, s55, 23 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX9-NEXT: v_writelane_b32 v31, s56, 24 +; GFX9-NEXT: v_writelane_b32 v33, s56, 24 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX9-NEXT: v_writelane_b32 v31, s57, 25 +; GFX9-NEXT: v_writelane_b32 v33, s57, 25 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX9-NEXT: v_writelane_b32 v31, s58, 26 +; GFX9-NEXT: v_writelane_b32 v33, s58, 26 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX9-NEXT: v_writelane_b32 
v31, s59, 27 +; GFX9-NEXT: v_writelane_b32 v33, s59, 27 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX9-NEXT: v_writelane_b32 v31, s60, 28 +; GFX9-NEXT: v_writelane_b32 v33, s60, 28 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX9-NEXT: v_writelane_b32 v31, s61, 29 +; GFX9-NEXT: v_writelane_b32 v33, s61, 29 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX9-NEXT: v_writelane_b32 v31, s62, 30 +; GFX9-NEXT: v_writelane_b32 v33, s62, 30 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX9-NEXT: v_writelane_b32 v31, s63, 31 +; GFX9-NEXT: v_writelane_b32 v33, s63, 31 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v31, s64, 32 -; GFX9-NEXT: v_writelane_b32 v31, s65, 33 -; GFX9-NEXT: v_writelane_b32 v31, s66, 34 +; GFX9-NEXT: v_writelane_b32 v33, s64, 32 +; GFX9-NEXT: v_writelane_b32 v33, s65, 33 +; GFX9-NEXT: v_writelane_b32 v33, s66, 34 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_writelane_b32 v31, s67, 35 +; GFX9-NEXT: v_writelane_b32 v33, s67, 35 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 @@ -28625,14 +28625,14 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v29, v32, v33, s[66:67] -; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[66:67] ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX9-NEXT: v_cndmask_b32_e64 v32, v32, v33, s[64:65] -; GFX9-NEXT: v_cndmask_b32_e64 v33, v28, v30, s[62:63] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[64:65] +; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[62:63] ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[60:61] @@ -28707,46 +28707,46 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4 ; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4 ; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v28, v33, s4 -; GFX9-NEXT: v_perm_b32 v15, v32, v29, s4 -; GFX9-NEXT: v_readlane_b32 s67, v31, 35 -; GFX9-NEXT: v_readlane_b32 s66, v31, 34 -; GFX9-NEXT: v_readlane_b32 s65, v31, 33 -; GFX9-NEXT: v_readlane_b32 s64, v31, 32 -; GFX9-NEXT: v_readlane_b32 s63, v31, 31 -; GFX9-NEXT: v_readlane_b32 s62, v31, 30 -; GFX9-NEXT: v_readlane_b32 s61, v31, 29 -; GFX9-NEXT: v_readlane_b32 s60, v31, 28 -; GFX9-NEXT: v_readlane_b32 s59, v31, 27 -; GFX9-NEXT: v_readlane_b32 s58, v31, 26 -; GFX9-NEXT: v_readlane_b32 s57, v31, 25 -; GFX9-NEXT: v_readlane_b32 s56, v31, 24 -; GFX9-NEXT: v_readlane_b32 s55, v31, 23 -; GFX9-NEXT: v_readlane_b32 s54, v31, 22 -; GFX9-NEXT: v_readlane_b32 s53, v31, 21 -; GFX9-NEXT: v_readlane_b32 s52, v31, 20 -; GFX9-NEXT: v_readlane_b32 s51, v31, 19 -; GFX9-NEXT: v_readlane_b32 s50, v31, 18 -; GFX9-NEXT: v_readlane_b32 s49, v31, 17 -; GFX9-NEXT: v_readlane_b32 s48, v31, 16 -; GFX9-NEXT: v_readlane_b32 s47, v31, 15 -; GFX9-NEXT: v_readlane_b32 s46, v31, 14 -; GFX9-NEXT: v_readlane_b32 s45, v31, 13 -; GFX9-NEXT: v_readlane_b32 s44, v31, 12 -; GFX9-NEXT: v_readlane_b32 s43, v31, 11 -; GFX9-NEXT: v_readlane_b32 s42, v31, 10 -; GFX9-NEXT: v_readlane_b32 s41, v31, 9 -; 
GFX9-NEXT: v_readlane_b32 s40, v31, 8 -; GFX9-NEXT: v_readlane_b32 s39, v31, 7 -; GFX9-NEXT: v_readlane_b32 s38, v31, 6 -; GFX9-NEXT: v_readlane_b32 s37, v31, 5 -; GFX9-NEXT: v_readlane_b32 s36, v31, 4 -; GFX9-NEXT: v_readlane_b32 s35, v31, 3 -; GFX9-NEXT: v_readlane_b32 s34, v31, 2 -; GFX9-NEXT: v_readlane_b32 s31, v31, 1 -; GFX9-NEXT: v_readlane_b32 s30, v31, 0 +; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4 +; GFX9-NEXT: v_perm_b32 v15, v31, v29, s4 +; GFX9-NEXT: v_readlane_b32 s67, v33, 35 +; GFX9-NEXT: v_readlane_b32 s66, v33, 34 +; GFX9-NEXT: v_readlane_b32 s65, v33, 33 +; GFX9-NEXT: v_readlane_b32 s64, v33, 32 +; GFX9-NEXT: v_readlane_b32 s63, v33, 31 +; GFX9-NEXT: v_readlane_b32 s62, v33, 30 +; GFX9-NEXT: v_readlane_b32 s61, v33, 29 +; GFX9-NEXT: v_readlane_b32 s60, v33, 28 +; GFX9-NEXT: v_readlane_b32 s59, v33, 27 +; GFX9-NEXT: v_readlane_b32 s58, v33, 26 +; GFX9-NEXT: v_readlane_b32 s57, v33, 25 +; GFX9-NEXT: v_readlane_b32 s56, v33, 24 +; GFX9-NEXT: v_readlane_b32 s55, v33, 23 +; GFX9-NEXT: v_readlane_b32 s54, v33, 22 +; GFX9-NEXT: v_readlane_b32 s53, v33, 21 +; GFX9-NEXT: v_readlane_b32 s52, v33, 20 +; GFX9-NEXT: v_readlane_b32 s51, v33, 19 +; GFX9-NEXT: v_readlane_b32 s50, v33, 18 +; GFX9-NEXT: v_readlane_b32 s49, v33, 17 +; GFX9-NEXT: v_readlane_b32 s48, v33, 16 +; GFX9-NEXT: v_readlane_b32 s47, v33, 15 +; GFX9-NEXT: v_readlane_b32 s46, v33, 14 +; GFX9-NEXT: v_readlane_b32 s45, v33, 13 +; GFX9-NEXT: v_readlane_b32 s44, v33, 12 +; GFX9-NEXT: v_readlane_b32 s43, v33, 11 +; GFX9-NEXT: v_readlane_b32 s42, v33, 10 +; GFX9-NEXT: v_readlane_b32 s41, v33, 9 +; GFX9-NEXT: v_readlane_b32 s40, v33, 8 +; GFX9-NEXT: v_readlane_b32 s39, v33, 7 +; GFX9-NEXT: v_readlane_b32 s38, v33, 6 +; GFX9-NEXT: v_readlane_b32 s37, v33, 5 +; GFX9-NEXT: v_readlane_b32 s36, v33, 4 +; GFX9-NEXT: v_readlane_b32 s35, v33, 3 +; GFX9-NEXT: v_readlane_b32 s34, v33, 2 +; GFX9-NEXT: v_readlane_b32 s31, v33, 1 +; GFX9-NEXT: v_readlane_b32 s30, v33, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; 
GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -28754,8 +28754,8 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-LABEL: v_vselect_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_and_b32_e32 v29, 1, v29 @@ -28770,31 +28770,31 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 ; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX10-NEXT: s_clause 0x14 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_ushort v34, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 -; GFX10-NEXT: 
buffer_load_dword v54, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_ushort v33, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:80 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 
offset:28 +; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:28 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v30 ; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v28 @@ -28816,17 +28816,17 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24 ; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12 ; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; GFX10-NEXT: v_writelane_b32 v31, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_writelane_b32 v31, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_writelane_b32 v31, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 @@ -28845,7 +28845,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0 -; GFX10-NEXT: v_writelane_b32 v31, s35, 3 +; GFX10-NEXT: v_writelane_b32 v40, s35, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27 ; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25 ; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23 @@ -28861,111 +28861,111 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9 ; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: 
v_lshrrev_b32_e32 v1, 16, v33 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v32 ; GFX10-NEXT: s_waitcnt vmcnt(30) -; GFX10-NEXT: v_and_b32_e32 v2, 1, v34 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v33 ; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v35 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v34 ; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_cndmask_b32_e64 v15, v35, v36, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v36 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v33, v32, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v34, v35, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v32, v31, s5 ; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_cndmask_b32_e64 v19, v38, v39, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v37, v38, s7 ; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v48 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v39 ; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_cndmask_b32_e64 v13, v48, v49, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v49 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, v48, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v48 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v38 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v36 ; GFX10-NEXT: s_waitcnt vmcnt(18) -; GFX10-NEXT: v_cndmask_b32_e64 v27, v53, v54, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v52, v53, s10 ; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v55 +; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54 ; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: v_cndmask_b32_e64 v21, v55, v64, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v54, v55, s9 ; GFX10-NEXT: s_waitcnt vmcnt(15) -; GFX10-NEXT: v_cndmask_b32_e64 v11, v65, v37, s8 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v65 -; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v64 -; GFX10-NEXT: 
v_lshrrev_b32_e32 v32, 16, v54 -; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v53 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v51, v52, s11 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v52 -; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v51 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v64, v36, s8 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v64 +; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v55 +; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53 +; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v52 +; GFX10-NEXT: v_cndmask_b32_e64 v33, v50, v51, s11 +; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v51 +; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v50 ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_cndmask_b32_e64 v37, v30, v50, s12 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v50 +; GFX10-NEXT: v_cndmask_b32_e64 v36, v30, v49, s12 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49 ; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v29, v69, s13 -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v69 +; GFX10-NEXT: v_cndmask_b32_e64 v38, v29, v68, s13 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68 ; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_cndmask_b32_e64 v50, v24, v22, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v24, v22, s15 ; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_cndmask_b32_e64 v51, v68, v20, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v50, v67, v20, s16 ; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v68 +; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v67 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_cndmask_b32_e64 v53, v67, v18, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v52, v66, v18, s17 ; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e64 v49, v28, v26, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v48, v28, v26, s14 ; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GFX10-NEXT: 
v_lshrrev_b32_e32 v28, 16, v28 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v67 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v66, v16, s18 +; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v66 +; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v16, s18 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v66 +; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v65 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v65, v14, v12, s19 +; GFX10-NEXT: v_cndmask_b32_e64 v64, v14, v12, s19 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v66, v1, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v67, v6, v5, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v8, v7, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v69, v10, v9, s22 +; GFX10-NEXT: v_cndmask_b32_e32 v65, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v66, v6, v5, s20 +; GFX10-NEXT: v_cndmask_b32_e64 v67, v8, v7, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v68, v10, v9, s22 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v25, v23, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v33, v32, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v36, v35, s25 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v38, s26 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v48, s27 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v32, v31, s24 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v35, v34, s25 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v37, s26 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v52, v20, s29 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v16, s30 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v54, v18, s34 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, s30 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, 
v3, s4 -; GFX10-NEXT: v_perm_b32 v0, v0, v65, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v1, v55, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v2, v53, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v20, v51, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v12, v50, 0x5040100 -; GFX10-NEXT: v_perm_b32 v5, v5, v49, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v6, v39, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v7, v37, 0x5040100 -; GFX10-NEXT: v_perm_b32 v8, v8, v34, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v2, v52, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v20, v50, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v12, v49, 0x5040100 +; GFX10-NEXT: v_perm_b32 v5, v5, v48, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v6, v38, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v7, v36, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v8, v33, 0x5040100 ; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x5040100 ; GFX10-NEXT: v_perm_b32 v10, v10, v21, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v69, v11, 0x5040100 -; GFX10-NEXT: v_perm_b32 v12, v68, v19, 0x5040100 -; GFX10-NEXT: v_perm_b32 v13, v67, v13, 0x5040100 -; GFX10-NEXT: v_perm_b32 v14, v66, v17, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v68, v11, 0x5040100 +; GFX10-NEXT: v_perm_b32 v12, v67, v19, 0x5040100 +; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100 +; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100 ; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 -; GFX10-NEXT: v_readlane_b32 s35, v31, 3 -; GFX10-NEXT: v_readlane_b32 s34, v31, 2 -; GFX10-NEXT: v_readlane_b32 s31, v31, 1 -; GFX10-NEXT: v_readlane_b32 s30, v31, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s35, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v40, off, 
s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 634106d20489e..e926a3c728cbd 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -278,11 +278,11 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GCN: v_writelane_b32 v0 +; GCN: v_writelane_b32 v1 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4 ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v0 +; GCN: v_writelane_b32 v1 ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 @@ -320,19 +320,19 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-61: v_writelane_b32 v0, +; GCN-COUNT-61: v_writelane_b32 v1, ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GCN: v_writelane_b32 v0, +; GCN: v_writelane_b32 v1, ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v0, +; GCN: v_writelane_b32 v1, ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 -; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v0 +; GCN-COUNT-64: v_readlane_b32 
s{{[0-9]+}}, v1 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll index 36a7ed51227a6..764c40ebc714d 100644 --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -19,25 +19,25 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 16 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 +; CHECK-NEXT: v_writelane_b32 v41, s16, 16 +; CHECK-NEXT: v_writelane_b32 v41, s30, 0 +; CHECK-NEXT: v_writelane_b32 v41, s31, 1 +; CHECK-NEXT: v_writelane_b32 v41, s34, 2 +; CHECK-NEXT: v_writelane_b32 v41, s35, 3 +; CHECK-NEXT: v_writelane_b32 v41, s36, 4 +; CHECK-NEXT: v_writelane_b32 v41, s37, 5 +; CHECK-NEXT: v_writelane_b32 v41, s38, 6 +; CHECK-NEXT: v_writelane_b32 v41, s39, 7 +; CHECK-NEXT: v_writelane_b32 v41, 
s40, 8 +; CHECK-NEXT: v_writelane_b32 v41, s41, 9 +; CHECK-NEXT: v_writelane_b32 v41, s42, 10 +; CHECK-NEXT: v_writelane_b32 v41, s43, 11 +; CHECK-NEXT: v_writelane_b32 v41, s44, 12 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_writelane_b32 v40, s46, 14 +; CHECK-NEXT: v_writelane_b32 v41, s45, 13 +; CHECK-NEXT: v_writelane_b32 v41, s46, 14 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef ; CHECK-NEXT: .Ltmp0: @@ -45,11 +45,11 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12 -; CHECK-NEXT: v_writelane_b32 v40, s47, 15 +; CHECK-NEXT: v_writelane_b32 v41, s47, 15 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 @@ -67,33 +67,33 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0] ; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9 -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: flat_store_dword v[0:1], v2 -; CHECK-NEXT: v_readlane_b32 s47, v40, 15 -; CHECK-NEXT: v_readlane_b32 s46, v40, 14 -; 
CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 16 +; CHECK-NEXT: v_readlane_b32 s47, v41, 15 +; CHECK-NEXT: v_readlane_b32 s46, v41, 14 +; CHECK-NEXT: v_readlane_b32 s45, v41, 13 +; CHECK-NEXT: v_readlane_b32 s44, v41, 12 +; CHECK-NEXT: v_readlane_b32 s43, v41, 11 +; CHECK-NEXT: v_readlane_b32 s42, v41, 10 +; CHECK-NEXT: v_readlane_b32 s41, v41, 9 +; CHECK-NEXT: v_readlane_b32 s40, v41, 8 +; CHECK-NEXT: v_readlane_b32 s39, v41, 7 +; CHECK-NEXT: v_readlane_b32 s38, v41, 6 +; CHECK-NEXT: v_readlane_b32 s37, v41, 5 +; CHECK-NEXT: v_readlane_b32 s36, v41, 4 +; CHECK-NEXT: v_readlane_b32 s35, v41, 3 +; CHECK-NEXT: v_readlane_b32 s34, v41, 2 +; CHECK-NEXT: v_readlane_b32 s31, v41, 1 +; CHECK-NEXT: v_readlane_b32 s30, v41, 0 +; CHECK-NEXT: v_readlane_b32 s4, v41, 16 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 145ab4ae6378b..a118fa388f86d 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -3992,29 +3992,29 @@ define 
amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_ubyte v0, v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_ubyte v0, v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: global_store_byte v[41:42], v0, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_byte v[40:41], v0, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: 
v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4027,30 +4027,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_ubyte v0, v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_ubyte v0, v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: global_store_byte v[41:42], v0, off +; GFX10-NEXT: global_store_byte v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4064,30 +4064,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_u8 v0, v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_u8 v0, v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 
+; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: global_store_b8 v[41:42], v0, off +; GFX11-NEXT: global_store_b8 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4100,30 +4100,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: global_store_byte v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_byte v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4144,33 +4144,33 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte 
Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_ushort v0, v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_ushort v0, v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: global_store_short v[41:42], v0, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_short v[40:41], v0, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], 
-1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4183,34 +4183,34 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_ushort v0, v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_ushort v0, v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: 
v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: global_store_short v[41:42], v0, off +; GFX10-NEXT: global_store_short v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4224,36 +4224,36 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_u16 v0, v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: 
v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_u16 v0, v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v1, 8, v0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: global_store_b16 v[41:42], v0, off +; GFX11-NEXT: global_store_b16 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4266,34 +4266,34 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 
4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b16 v1, 8, v0 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: global_store_short v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: 
scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4314,19 +4314,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_dword v0, v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_dword v0, v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4336,15 +4336,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: global_store_byte v[3:4], v2, off -; GFX9-NEXT: global_store_short v[41:42], v0, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_short v[40:41], v0, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4357,20 +4357,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; 
GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_dword v0, v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dword v0, v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4378,17 +4378,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_mov_b32_e32 v3, 2 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: global_store_byte v[3:4], v2, off -; GFX10-NEXT: global_store_short v[41:42], v0, off +; GFX10-NEXT: global_store_short v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4402,20 +4402,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; 
GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_b32 v0, v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_b32 v0, v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4425,18 +4425,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 2 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b8 v[0:1], v2, off -; GFX11-NEXT: global_store_b16 v[41:42], v3, off +; GFX11-NEXT: global_store_b16 v[40:41], v3, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, 
off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4449,20 +4449,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_dword v0, v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_dword v0, v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4470,17 +4470,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: global_store_byte v[3:4], v2, off -; GFX10-SCRATCH-NEXT: global_store_short v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4501,19 +4501,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte 
Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_dword v0, v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_dword v0, v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 @@ -4524,15 +4524,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v[41:42], v0, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_dword v[40:41], v0, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 
offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4545,20 +4545,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_dword v0, v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dword v0, v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 
v2, 16, v0 @@ -4566,18 +4566,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dword v[41:42], v0, off +; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4591,20 +4591,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 
off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_b32 v0, v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_b32 v0, v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4615,22 +4615,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: global_store_b32 v[41:42], v0, off +; GFX11-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: 
s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4643,20 +4643,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_dword v0, v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_dword v0, v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ 
-4664,18 +4664,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4696,19 +4696,19 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded 
Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_dwordx2 v[5:6], v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v5 @@ -4724,15 +4724,15 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_byte v[0:1], v4, off -; GFX9-NEXT: global_store_dword v[41:42], v2, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_dword v[40:41], v2, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4745,20 +4745,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_dwordx2 v[5:6], v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6] ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v5 @@ -4768,21 +4768,21 
@@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_byte v[0:1], v4, off -; GFX10-NEXT: global_store_dword v[41:42], v2, off +; GFX10-NEXT: global_store_dword v[40:41], v2, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4796,20 +4796,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_b64 v[5:6], v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_b64 v[5:6], v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6] @@ -4822,10 +4822,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 @@ -4835,12 +4835,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b8 v[0:1], v4, off -; GFX11-NEXT: global_store_b32 v[41:42], v2, off +; GFX11-NEXT: global_store_b32 v[40:41], v2, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 
-; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4853,20 +4853,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[5:6], v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[5:6], v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6] ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v5 @@ -4876,21 +4876,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v4, off -; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v2, off +; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v2, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4911,19 +4911,19 @@ define 
amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4944,15 +4944,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx2 v[41:42], v[3:4], off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; 
GFX9-NEXT: global_store_dwordx2 v[40:41], v[3:4], off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4965,20 +4965,20 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo ; 
GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4993,21 +4993,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v7, 8, v7 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx2 v[41:42], v[0:1], off +; GFX10-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, 
s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5021,31 +5021,31 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_b64 v[0:1], v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_b64 v[0:1], v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_lshlrev_b16 v5, 
8, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 @@ -5056,23 +5056,23 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-NEXT: global_store_b64 v[41:42], v[0:1], off +; GFX11-NEXT: global_store_b64 v[40:41], v[0:1], off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5085,20 +5085,20 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -5113,21 +5113,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v7, 8, v7 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: global_store_dwordx2 v[41:42], v[0:1], off +; GFX10-SCRATCH-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5148,24 +5148,24 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: 
buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: v_mov_b32_e32 v42, 16 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v43, 16 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: v_mov_b32_e32 v44, 0 -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[41:42], off -; GFX9-NEXT: global_load_dwordx4 v[16:19], v[43:44], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[40:41], off +; GFX9-NEXT: global_load_dwordx4 v[16:19], v[42:43], off +; GFX9-NEXT: v_writelane_b32 v44, s34, 2 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v44, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 @@ -5245,18 +5245,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx4 v[43:44], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[41:42], v[6:9], off -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte 
Folded Reload +; GFX9-NEXT: global_store_dwordx4 v[42:43], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[40:41], v[6:9], off +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 +; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: v_readlane_b32 s34, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5269,26 +5269,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 
offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 +; GFX10-NEXT: v_mov_b32_e32 v42, 16 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v43, 16 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_mov_b32_e32 v44, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_mov_b32_e32 v43, 0 +; GFX10-NEXT: v_writelane_b32 v44, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[41:42], off -; GFX10-NEXT: global_load_dwordx4 v[16:19], v[43:44], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[40:41], off +; GFX10-NEXT: global_load_dwordx4 v[16:19], v[42:43], off +; GFX10-NEXT: v_writelane_b32 v44, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v44, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v35, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v0 @@ -5368,18 +5368,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-NEXT: v_or_b32_sdwa v8, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v7, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx4 v[43:44], v[7:10], off -; GFX10-NEXT: 
global_store_dwordx4 v[41:42], v[3:6], off +; GFX10-NEXT: global_store_dwordx4 v[42:43], v[7:10], off +; GFX10-NEXT: global_store_dwordx4 v[40:41], v[3:6], off ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 +; GFX10-NEXT: v_readlane_b32 s31, v44, 1 +; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: v_readlane_b32 s34, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -5393,24 +5393,24 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 -; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_dual_mov_b32 v42, 0 :: v_dual_mov_b32 v43, 16 -; 
GFX11-NEXT: v_mov_b32_e32 v44, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 +; GFX11-NEXT: v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 16 +; GFX11-NEXT: v_mov_b32_e32 v43, 0 +; GFX11-NEXT: v_writelane_b32 v44, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi -; GFX11-NEXT: global_load_b128 v[0:3], v[41:42], off +; GFX11-NEXT: global_load_b128 v[0:3], v[40:41], off ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo -; GFX11-NEXT: global_load_b128 v[16:19], v[43:44], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: global_load_b128 v[16:19], v[42:43], off +; GFX11-NEXT: v_writelane_b32 v44, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v44, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v0 @@ -5519,18 +5519,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 ; GFX11-NEXT: v_or_b32_e32 v3, v0, v2 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[43:44], v[7:10], off -; GFX11-NEXT: global_store_b128 v[41:42], v[3:6], off +; GFX11-NEXT: global_store_b128 v[42:43], v[7:10], off +; GFX11-NEXT: global_store_b128 v[40:41], v[3:6], off ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 +; GFX11-NEXT: 
scratch_load_b32 v42, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 +; GFX11-NEXT: v_readlane_b32 s31, v44, 1 +; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5543,26 +5543,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v44, s33 offset:16 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v44, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:12 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 16 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v43, 16 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: 
v_mov_b32_e32 v44, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v43, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[41:42], off -; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[16:19], v[43:44], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[40:41], off +; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[16:19], v[42:43], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v35, 8, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v36, 16, v0 @@ -5642,18 +5642,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v8, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v7, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: global_store_dwordx4 v[43:44], v[7:10], off -; GFX10-SCRATCH-NEXT: global_store_dwordx4 v[41:42], v[3:6], off +; GFX10-SCRATCH-NEXT: global_store_dwordx4 v[42:43], v[7:10], off +; GFX10-SCRATCH-NEXT: global_store_dwordx4 v[40:41], v[3:6], off ; GFX10-SCRATCH-NEXT: s_clause 0x3 -; GFX10-SCRATCH-NEXT: scratch_load_dword v44, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:12 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:12 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v44, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 @@ -8255,29 +8255,29 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 ; GFX9-NEXT: s_mov_b32 s35, external_i32_func_i32@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, 
external_i32_func_i32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v42, v1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: global_store_dword v[41:42], v0, off +; GFX9-NEXT: global_store_dword v[40:41], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8290,31 +8290,31 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; 
GFX10-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_mov_b32 s35, external_i32_func_i32@abs32@hi -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 ; GFX10-NEXT: s_mov_b32 s34, external_i32_func_i32@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_mov_b32_e32 v42, v1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: global_store_dword v[41:42], v0, off +; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8328,30 +8328,30 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, 
v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 -; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 ; GFX11-NEXT: s_mov_b32 s1, external_i32_func_i32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_i32_func_i32@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc +; GFX11-NEXT: global_store_b32 v[40:41], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8364,31 +8364,31 @@ define amdgpu_gfx void 
@test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, v0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_i32_func_i32@abs32@hi -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_i32_func_i32@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll index ad55d49a1a96d..a14e3d5673f82 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -322,30 +322,30 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 2 +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_void@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_void@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 
; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v41, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v31 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mov_b32_e32 v31, v41 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v41, 1 +; GFX9-NEXT: v_readlane_b32 s30, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -358,31 +358,31 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_void@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_void@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; 
def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v41, v31 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v40, v31 +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_mov_b32_e32 v31, v41 +; GFX10-NEXT: v_mov_b32_e32 v31, v40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v41, 1 +; GFX10-NEXT: v_readlane_b32 s30, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v41, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -396,31 +396,31 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_void@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_void@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: v_writelane_b32 v41, s30, 0 ; 
GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_mov_b32_e32 v41, v31 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_mov_b32_e32 v40, v31 +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v31, v41 +; GFX11-NEXT: v_mov_b32_e32 v31, v40 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v41, 1 +; GFX11-NEXT: v_readlane_b32 s30, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1184,16 +1184,16 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 3 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_void@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_void@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND @@ -1201,21 +1201,21 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v32 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v41, v32 +; GFX9-NEXT: v_mov_b32_e32 v40, v32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s4 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v41 +; GFX9-NEXT: ; use v40 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v41, 2 +; GFX9-NEXT: v_readlane_b32 s30, v41, 1 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1228,15 +1228,15 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: 
v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v41, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_void@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_void@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s4, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND @@ -1244,23 +1244,23 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v41, v32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_mov_b32_e32 v40, v32 +; GFX10-NEXT: v_writelane_b32 v41, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s4 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v41 +; GFX10-NEXT: ; use v40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v41, 2 +; GFX10-NEXT: v_readlane_b32 s30, v41, 1 +; GFX10-NEXT: v_readlane_b32 s4, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v41, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1274,14 
+1274,14 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v41, s0, 3 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_void@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_void@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: v_writelane_b32 v41, s4, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND @@ -1289,23 +1289,23 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v32 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_mov_b32_e32 v41, v32 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: v_mov_b32_e32 v40, v32 +; GFX11-NEXT: v_writelane_b32 v41, s30, 1 +; GFX11-NEXT: v_writelane_b32 v41, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s4 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v41 +; GFX11-NEXT: ; use v40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v41, 2 +; GFX11-NEXT: v_readlane_b32 s30, v41, 1 +; GFX11-NEXT: v_readlane_b32 s4, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 
3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 23502d1b36d18..c1d682689903a 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2764,27 +2764,26 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xffff8000 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 
; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:8 @@ -2827,7 +2826,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GFX9-NEXT: v_writelane_b32 v33, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo ; GFX9-NEXT: v_add_u32_e32 v0, 0x200, v0 @@ -2862,41 +2861,41 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 -; GFX9-NEXT: v_writelane_b32 v33, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:648 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:652 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:656 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:660 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:664 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:668 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s33 offset:672 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s33 offset:676 -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s33 offset:680 -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s33 offset:684 -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s33 offset:688 -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:692 -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:696 -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s33 offset:700 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:704 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:708 
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:712 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:716 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:720 -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:724 -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:728 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:732 -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:736 -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:740 -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:748 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:752 -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:756 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:760 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:764 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:768 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:648 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:652 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:656 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:660 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:664 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:668 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:672 +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s33 offset:676 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s33 offset:680 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s33 offset:684 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s33 offset:688 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s33 offset:692 +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:696 +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:700 +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s33 offset:704 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:708 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], 
s33 offset:712 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:716 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:720 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:724 +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:728 +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:732 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:736 +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:740 +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:748 +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:752 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:756 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:760 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:764 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:768 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:772 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:776 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:780 @@ -2953,38 +2952,38 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 
offset:52 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; GFX9-NEXT: v_mov_b32_e32 v0, 24 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 @@ -3003,26 +3002,25 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 
offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v33, 1 -; GFX9-NEXT: v_readlane_b32 s30, v33, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 ; GFX9-NEXT: s_mov_b32 s33, s36 @@ -3036,27 +3034,27 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: s_add_i32 s33, s32, 0x3fe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX10-NEXT: 
buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v61, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 @@ -3133,7 +3131,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v31, 0 ; GFX10-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0x28 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 @@ -3154,21 +3152,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:696 ; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:700 ; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s33 offset:704 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:708 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:712 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:716 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:720 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:724 -; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:728 -; GFX10-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:732 -; GFX10-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:736 -; GFX10-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:740 -; GFX10-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:748 -; GFX10-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:752 -; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:756 -; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:760 -; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:764 -; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:768 +; 
GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:708 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:712 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:716 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:720 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:724 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:728 +; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:732 +; GFX10-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:736 +; GFX10-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:740 +; GFX10-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:748 +; GFX10-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:752 +; GFX10-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:756 +; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:760 +; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:764 +; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:768 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:772 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:776 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:780 @@ -3243,21 +3241,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:60 ; GFX10-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:64 ; GFX10-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 -; GFX10-NEXT: 
buffer_store_dword v57, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:132 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 @@ -3279,25 +3277,25 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0xe -; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v61, off, 
s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 -; GFX10-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 -; GFX10-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 -; GFX10-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 -; GFX10-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 -; GFX10-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 -; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 +; GFX10-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 +; GFX10-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 +; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 +; GFX10-NEXT: v_readlane_b32 s31, v63, 1 +; GFX10-NEXT: v_readlane_b32 s30, v63, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_add_i32 s32, s32, 0xfffec000 @@ -3313,7 +3311,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:1536 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3323,22 +3321,19 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xe -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v59, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v61, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v62, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v63, s33 +; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 +; GFX11-NEXT: 
scratch_store_b32 off, v42, s33 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v59, s33 ; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 ; GFX11-NEXT: s_add_i32 s1, s32, 0x90 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 @@ -3361,7 +3356,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: s_add_i32 s0, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v60, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0 @@ -3380,7 +3375,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 ; GFX11-NEXT: s_mov_b32 s45, return_72xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s44, return_72xi32@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v60, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 @@ -3391,22 +3386,16 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 ; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 -; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:688 -; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:704 -; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:720 +; 
GFX11-NEXT: scratch_load_b128 v[37:40], off, s33 offset:688 +; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:704 +; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:720 ; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:736 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:752 ; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 ; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 ; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52 -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44 -; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v12 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1588 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 @@ -3414,13 +3403,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 ; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 -; GFX11-NEXT: v_mov_b32_e32 v56, v63 -; GFX11-NEXT: v_mov_b32_e32 v12, v15 -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v15, v2 -; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v19 +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_mov_b32_e32 v10, v21 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3432,47 +3415,53 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 -; GFX11-NEXT: v_mov_b32_e32 v32, v36 +; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v32, v36 ; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 -; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51 -; GFX11-NEXT: v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41 -; GFX11-NEXT: v_mov_b32_e32 v50, v42 -; GFX11-NEXT: v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v13 -; GFX11-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9 +; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v48, v51 +; GFX11-NEXT: v_dual_mov_b32 v49, v52 :: v_dual_mov_b32 v50, v53 +; GFX11-NEXT: v_dual_mov_b32 v51, v54 :: v_dual_mov_b32 v36, v55 +; GFX11-NEXT: v_dual_mov_b32 v53, v41 :: v_dual_mov_b32 v52, v40 +; GFX11-NEXT: v_dual_mov_b32 v54, v42 :: v_dual_mov_b32 v41, v56 +; GFX11-NEXT: v_dual_mov_b32 v55, v43 :: v_dual_mov_b32 v40, v44 +; GFX11-NEXT: v_dual_mov_b32 v42, v57 :: v_dual_mov_b32 v57, v12 +; GFX11-NEXT: v_dual_mov_b32 v43, v58 :: v_dual_mov_b32 v56, v59 +; GFX11-NEXT: v_mov_b32_e32 v58, v13 +; GFX11-NEXT: v_dual_mov_b32 v12, v15 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6 +; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9 +; GFX11-NEXT: v_mov_b32_e32 v9, v20 ; GFX11-NEXT: scratch_store_b32 off, v11, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x90 -; GFX11-NEXT: v_mov_b32_e32 v51, v43 -; GFX11-NEXT: v_mov_b32_e32 v41, v59 +; GFX11-NEXT: v_mov_b32_e32 v11, v22 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 -; GFX11-NEXT: v_mov_b32_e32 v7, v18 ; GFX11-NEXT: s_add_i32 s0, s32, 0x80 -; GFX11-NEXT: v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61 +; GFX11-NEXT: v_mov_b32_e32 v5, v16 ; GFX11-NEXT: scratch_store_b128 
off, v[0:3], s0 -; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20 +; GFX11-NEXT: v_mov_b32_e32 v0, 24 ; GFX11-NEXT: s_add_i32 s0, s32, 0x70 -; GFX11-NEXT: v_mov_b32_e32 v5, v16 +; GFX11-NEXT: v_mov_b32_e32 v6, v17 ; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0 +; GFX11-NEXT: v_mov_b32_e32 v13, v24 ; GFX11-NEXT: s_add_i32 s0, s32, 0x6c -; GFX11-NEXT: v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v11, v22 +; GFX11-NEXT: v_mov_b32_e32 v7, v18 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x60 -; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 +; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26 ; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x50 -; GFX11-NEXT: v_mov_b32_e32 v13, v24 -; GFX11-NEXT: scratch_store_b128 off, v[41:44], s0 +; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 +; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47 +; GFX11-NEXT: v_mov_b32_e32 v14, v25 ; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 48 -; GFX11-NEXT: v_mov_b32_e32 v15, v26 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v16, v27 ; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 +; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 ; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3485,26 +3474,23 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] -; GFX11-NEXT: s_clause 0xe 
-; GFX11-NEXT: scratch_load_b32 v63, off, s33 -; GFX11-NEXT: scratch_load_b32 v62, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v61, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:12 -; GFX11-NEXT: scratch_load_b32 v59, off, s33 offset:16 -; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:20 -; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:24 -; GFX11-NEXT: scratch_load_b32 v56, off, s33 offset:28 -; GFX11-NEXT: scratch_load_b32 v47, off, s33 offset:32 -; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:36 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:40 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:44 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:48 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:52 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:56 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: scratch_load_b32 v59, off, s33 +; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v56, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v47, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:24 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:28 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:32 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:36 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:40 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:44 +; GFX11-NEXT: v_readlane_b32 s31, v60, 1 +; GFX11-NEXT: v_readlane_b32 s30, v60, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 ; GFX11-NEXT: s_mov_b32 s33, s46 diff 
--git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 65404637ca51b..76ec1cc84f55b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -152,23 +152,23 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -202,23 +202,23 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte 
Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -830,23 +830,23 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 
1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -958,23 +958,23 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1416,23 +1416,23 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_add v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1466,23 +1466,23 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_add v1, off, 
s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1766,23 +1766,23 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1816,23 +1816,23 @@ define amdgpu_gfx void 
@global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2116,23 +2116,23 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2166,23 +2166,23 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; 
SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2649,38 +2649,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB44_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, s34, v2 -; SI-NEXT: v_not_b32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, s34, v1 +; SI-NEXT: v_not_b32_e32 v0, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB44_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2740,38 +2740,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB45_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, s34, v2 -; SI-NEXT: v_not_b32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, s34, v1 +; SI-NEXT: v_not_b32_e32 v0, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB45_1 ; SI-NEXT: ; %bb.2: ; 
%atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2834,39 +2834,38 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB46_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_and_b32_e32 v0, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_not_b32_e32 v3, v0 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: v_and_b32_e32 v0, s34, v2 +; SI-NEXT: v_not_b32_e32 v1, v0 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 
vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB46_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2928,39 +2927,38 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB47_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_and_b32_e32 v0, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_not_b32_e32 v3, v0 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: 
v_and_b32_e32 v0, s34, v2 +; SI-NEXT: v_not_b32_e32 v1, v0 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB47_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3167,23 +3165,23 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: 
v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3217,23 +3215,23 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3517,23 +3515,23 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3567,23 +3565,23 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0 offset:16 +; SI-NEXT: 
buffer_atomic_xor v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4038,37 +4036,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB68_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_i32_e32 v1, s34, v2 +; SI-NEXT: v_max_i32_e32 v0, s34, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: 
v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB68_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4126,37 +4124,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB69_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_i32_e32 v1, s34, v2 +; SI-NEXT: v_max_i32_e32 v0, s34, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB69_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4217,38 +4215,37 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB70_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_max_i32_e32 v3, s34, v4 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; 
SI-NEXT: v_max_i32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB70_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4308,38 +4305,37 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB71_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_max_i32_e32 v3, s34, v4 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_max_i32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB71_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5135,37 +5131,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; 
SI-NEXT: .LBB80_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_u32_e32 v1, s34, v2 +; SI-NEXT: v_max_u32_e32 v0, s34, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB80_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5223,37 +5219,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, 
off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB81_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_u32_e32 v1, s34, v2 +; SI-NEXT: v_max_u32_e32 v0, s34, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB81_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5314,38 +5310,37 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: 
v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB82_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_max_u32_e32 v3, s34, v4 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: v_max_u32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB82_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5405,38 +5400,37 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, 
s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_max_u32_e32 v3, s34, v4 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_max_u32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6137,37 +6131,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_u32_e32 v1, s34, v2 +; SI-NEXT: v_min_u32_e32 v0, s34, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6225,37 +6219,37 @@ define amdgpu_gfx void 
@global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB92_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_u32_e32 v1, s34, v2 +; SI-NEXT: v_min_u32_e32 v0, s34, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: 
s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6316,38 +6310,37 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB93_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_min_u32_e32 v3, s34, v4 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: v_min_u32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB93_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 
s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6407,38 +6400,37 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB94_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_min_u32_e32 v3, s34, v4 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_min_u32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB94_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; 
SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6814,37 +6806,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_i32_e32 v1, s34, v2 +; SI-NEXT: v_min_i32_e32 v0, s34, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, 
exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6902,37 +6894,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_i32_e32 v1, s34, v2 +; SI-NEXT: v_min_i32_e32 v0, s34, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, 
v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6993,38 +6985,37 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_min_i32_e32 v3, s34, v4 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: v_min_i32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 
v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7084,38 +7075,37 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_min_i32_e32 v3, s34, v4 -; SI-NEXT: v_mov_b32_e32 v2, v3 -; 
SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_min_i32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7727,23 +7717,23 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7777,23 +7767,23 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8077,23 +8067,23 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8127,23 +8117,23 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 34457781a9999..d137f471910dc 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -154,25 +154,25 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: 
v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -208,23 +208,23 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -846,25 +846,25 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -978,23 +978,23 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 
+; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1446,25 +1446,25 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1500,23 +1500,23 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1810,25 +1810,25 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1864,23 +1864,23 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2174,25 +2174,25 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; 
SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2228,23 +2228,23 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2772,44 +2772,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB44_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, s34, v4 +; SI-NEXT: v_and_b32_e32 v0, s34, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, s35, v3 -; SI-NEXT: v_not_b32_e32 v2, v1 -; SI-NEXT: v_not_b32_e32 v1, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_and_b32_e32 v4, s35, v2 +; SI-NEXT: v_not_b32_e32 v1, v0 +; SI-NEXT: v_not_b32_e32 v0, v4 ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB44_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; 
SI-NEXT: s_setpc_b64 s[30:31] @@ -2877,44 +2877,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB45_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, s34, v4 +; SI-NEXT: v_and_b32_e32 v0, s34, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, s35, v3 -; SI-NEXT: v_not_b32_e32 v2, v1 -; SI-NEXT: v_not_b32_e32 v1, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_and_b32_e32 v4, s35, v2 +; SI-NEXT: v_not_b32_e32 v1, v0 +; SI-NEXT: v_not_b32_e32 v0, v4 ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: 
s_cbranch_execnz .LBB45_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2983,46 +2983,44 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB46_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_and_b32_e32 v0, s34, v8 -; SI-NEXT: v_and_b32_e32 v1, s35, v7 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_not_b32_e32 v6, v0 -; SI-NEXT: v_not_b32_e32 v5, v1 +; SI-NEXT: v_and_b32_e32 v0, s34, v5 +; SI-NEXT: v_and_b32_e32 v1, s35, v4 +; SI-NEXT: v_not_b32_e32 v3, v0 +; SI-NEXT: v_not_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; 
SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB46_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3090,46 +3088,44 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], 
off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB47_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_and_b32_e32 v0, s34, v8 -; SI-NEXT: v_and_b32_e32 v1, s35, v7 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_not_b32_e32 v6, v0 -; SI-NEXT: v_not_b32_e32 v5, v1 +; SI-NEXT: v_and_b32_e32 v0, s34, v5 +; SI-NEXT: v_and_b32_e32 v1, s35, v4 +; SI-NEXT: v_not_b32_e32 v3, v0 +; SI-NEXT: v_not_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB47_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3344,25 +3340,25 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3398,23 +3394,23 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, 
s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3708,25 +3704,25 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3762,23 +3758,23 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4294,45 +4290,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB68_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB68_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, 
v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4402,45 +4398,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB69_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc +; SI-NEXT: 
v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB69_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4511,47 +4507,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: 
.LBB70_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB70_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4621,47 +4615,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB71_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, 
s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB71_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5576,45 +5568,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB80_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: 
v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB80_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5684,45 +5676,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB81_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB81_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5793,47 +5785,45 @@ define 
amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB82_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, 
v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB82_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5903,47 +5893,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_lt_u64_e32 
vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6746,45 +6734,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded 
Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6854,45 +6842,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB92_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 
glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6963,47 +6951,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB93_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB93_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7073,47 +7059,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB94_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB94_1 ; SI-NEXT: ; 
%bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7552,45 +7536,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7660,45 +7644,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 
v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7769,47 +7753,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; 
SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7879,47 +7861,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: 
v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8601,25 +8581,25 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 
v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8655,23 +8635,23 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8965,25 +8945,25 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9019,23 +8999,23 @@ define amdgpu_gfx void 
@global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index e456b7d2e8b9b..297b5180dfe9b 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,209 +6,209 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, 
s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v1, s30, 0 -; CHECK-NEXT: v_writelane_b32 v1, s31, 1 -; CHECK-NEXT: v_writelane_b32 v1, s36, 2 -; CHECK-NEXT: v_writelane_b32 v1, s37, 3 -; CHECK-NEXT: v_writelane_b32 v1, s38, 4 -; CHECK-NEXT: v_writelane_b32 v1, s39, 5 -; CHECK-NEXT: v_writelane_b32 v1, s40, 6 -; CHECK-NEXT: v_writelane_b32 v1, s41, 7 -; CHECK-NEXT: v_writelane_b32 v1, s42, 8 -; CHECK-NEXT: v_writelane_b32 v1, s43, 9 -; CHECK-NEXT: v_writelane_b32 v1, s44, 10 -; CHECK-NEXT: v_writelane_b32 v1, s45, 11 -; CHECK-NEXT: v_writelane_b32 v1, s46, 12 -; CHECK-NEXT: v_writelane_b32 v1, s47, 13 -; CHECK-NEXT: v_writelane_b32 v1, s48, 14 -; CHECK-NEXT: v_writelane_b32 v1, s49, 15 +; CHECK-NEXT: v_writelane_b32 v8, s30, 0 +; CHECK-NEXT: v_writelane_b32 v8, s31, 1 +; CHECK-NEXT: v_writelane_b32 v8, s36, 2 +; CHECK-NEXT: v_writelane_b32 v8, s37, 3 +; CHECK-NEXT: v_writelane_b32 v8, s38, 4 +; CHECK-NEXT: v_writelane_b32 v8, s39, 5 +; CHECK-NEXT: v_writelane_b32 v8, s40, 6 +; CHECK-NEXT: v_writelane_b32 v8, s41, 7 +; CHECK-NEXT: v_writelane_b32 v8, s42, 8 +; CHECK-NEXT: v_writelane_b32 v8, s43, 9 +; CHECK-NEXT: v_writelane_b32 v8, s44, 10 +; CHECK-NEXT: v_writelane_b32 v8, s45, 11 +; CHECK-NEXT: v_writelane_b32 v8, s46, 12 +; CHECK-NEXT: v_writelane_b32 v8, s47, 13 +; CHECK-NEXT: v_writelane_b32 v8, s48, 14 +; CHECK-NEXT: v_writelane_b32 v8, s49, 15 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v1, s50, 16 +; CHECK-NEXT: v_writelane_b32 v8, s50, 16 ; CHECK-NEXT: s_movk_i32 s4, 0xf0 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v1, s51, 
17 +; CHECK-NEXT: v_writelane_b32 v8, s51, 17 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane +; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0 ; CHECK-NEXT: s_movk_i32 s4, 0x130 ; CHECK-NEXT: s_mov_b32 s5, s24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v5, s36, 0 -; CHECK-NEXT: v_writelane_b32 v5, s37, 1 -; CHECK-NEXT: v_writelane_b32 v5, s38, 2 -; CHECK-NEXT: v_writelane_b32 v5, s39, 3 -; CHECK-NEXT: v_writelane_b32 v5, s40, 4 -; CHECK-NEXT: v_writelane_b32 v5, s41, 5 -; CHECK-NEXT: v_writelane_b32 v5, s42, 6 -; CHECK-NEXT: v_writelane_b32 v5, s43, 7 -; CHECK-NEXT: v_writelane_b32 v5, s44, 8 -; CHECK-NEXT: v_writelane_b32 v5, s45, 9 -; CHECK-NEXT: v_writelane_b32 v5, s46, 10 +; CHECK-NEXT: v_writelane_b32 v4, s36, 0 +; CHECK-NEXT: v_writelane_b32 v4, s37, 1 +; CHECK-NEXT: v_writelane_b32 v4, s38, 2 +; CHECK-NEXT: v_writelane_b32 v4, s39, 3 +; CHECK-NEXT: v_writelane_b32 v4, s40, 4 +; CHECK-NEXT: v_writelane_b32 v4, s41, 5 +; CHECK-NEXT: v_writelane_b32 v4, s42, 6 +; CHECK-NEXT: v_writelane_b32 v4, s43, 7 +; CHECK-NEXT: v_writelane_b32 v4, s44, 8 +; CHECK-NEXT: v_writelane_b32 v4, s45, 9 +; CHECK-NEXT: v_writelane_b32 v4, s46, 10 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s47, 11 -; CHECK-NEXT: v_writelane_b32 v5, s48, 12 -; CHECK-NEXT: v_writelane_b32 v5, s49, 13 +; CHECK-NEXT: v_writelane_b32 v4, s47, 11 +; CHECK-NEXT: v_writelane_b32 v4, s48, 12 +; CHECK-NEXT: v_writelane_b32 v4, s49, 13 ; CHECK-NEXT: s_mov_b32 s20, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_writelane_b32 v5, s50, 14 -; CHECK-NEXT: v_mov_b32_e32 v6, s28 -; CHECK-NEXT: v_mov_b32_e32 v7, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_writelane_b32 v4, s50, 14 +; CHECK-NEXT: v_mov_b32_e32 v5, s28 +; CHECK-NEXT: v_mov_b32_e32 v6, v1 ; CHECK-NEXT: 
s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_writelane_b32 v5, s51, 15 -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: image_sample_lz v6, v[6:7], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v4, s51, 15 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v5, s4, 16 -; CHECK-NEXT: v_writelane_b32 v5, s5, 17 -; CHECK-NEXT: v_writelane_b32 v5, s6, 18 -; CHECK-NEXT: v_writelane_b32 v5, s7, 19 -; CHECK-NEXT: v_writelane_b32 v5, s8, 20 -; CHECK-NEXT: v_writelane_b32 v5, s9, 21 -; CHECK-NEXT: image_sample_lz v7, v[2:3], s[4:11], s[20:23] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v5, s10, 22 -; CHECK-NEXT: v_writelane_b32 v5, s11, 23 -; CHECK-NEXT: v_writelane_b32 v5, s12, 24 -; CHECK-NEXT: v_writelane_b32 v5, s13, 25 -; CHECK-NEXT: v_writelane_b32 v5, s14, 26 -; CHECK-NEXT: v_writelane_b32 v5, s15, 27 -; CHECK-NEXT: v_writelane_b32 v5, s16, 28 -; CHECK-NEXT: v_writelane_b32 v1, s52, 18 -; CHECK-NEXT: v_writelane_b32 v5, s17, 29 -; CHECK-NEXT: v_writelane_b32 v1, s53, 19 -; CHECK-NEXT: v_writelane_b32 v5, s18, 30 -; CHECK-NEXT: v_writelane_b32 v1, s54, 20 -; CHECK-NEXT: v_writelane_b32 v5, s19, 31 +; CHECK-NEXT: v_writelane_b32 v4, s4, 16 +; CHECK-NEXT: v_writelane_b32 v4, s5, 17 +; CHECK-NEXT: v_writelane_b32 v4, s6, 18 +; CHECK-NEXT: v_writelane_b32 v4, s7, 19 +; CHECK-NEXT: v_writelane_b32 v4, s8, 20 +; CHECK-NEXT: v_writelane_b32 v4, s9, 21 +; CHECK-NEXT: image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v4, s10, 22 +; CHECK-NEXT: v_writelane_b32 v4, s11, 23 +; CHECK-NEXT: v_writelane_b32 v4, s12, 24 +; CHECK-NEXT: v_writelane_b32 v4, s13, 25 +; CHECK-NEXT: v_writelane_b32 v4, s14, 26 +; CHECK-NEXT: v_writelane_b32 v4, s15, 27 +; CHECK-NEXT: v_writelane_b32 v4, s16, 28 +; CHECK-NEXT: v_writelane_b32 v8, s52, 18 +; CHECK-NEXT: 
v_writelane_b32 v4, s17, 29 +; CHECK-NEXT: v_writelane_b32 v8, s53, 19 +; CHECK-NEXT: v_writelane_b32 v4, s18, 30 +; CHECK-NEXT: v_writelane_b32 v8, s54, 20 +; CHECK-NEXT: v_writelane_b32 v4, s19, 31 ; CHECK-NEXT: s_mov_b32 s4, 48 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v1, s55, 21 +; CHECK-NEXT: v_writelane_b32 v8, s55, 21 ; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v1, s56, 22 -; CHECK-NEXT: v_writelane_b32 v1, s57, 23 -; CHECK-NEXT: v_writelane_b32 v1, s58, 24 -; CHECK-NEXT: v_writelane_b32 v1, s59, 25 -; CHECK-NEXT: v_writelane_b32 v1, s60, 26 +; CHECK-NEXT: v_writelane_b32 v8, s56, 22 +; CHECK-NEXT: v_writelane_b32 v8, s57, 23 +; CHECK-NEXT: v_writelane_b32 v8, s58, 24 +; CHECK-NEXT: v_writelane_b32 v8, s59, 25 +; CHECK-NEXT: v_writelane_b32 v8, s60, 26 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v5, s4, 32 -; CHECK-NEXT: v_writelane_b32 v1, s61, 27 -; CHECK-NEXT: v_writelane_b32 v5, s5, 33 -; CHECK-NEXT: v_writelane_b32 v1, s62, 28 -; CHECK-NEXT: v_writelane_b32 v5, s6, 34 -; CHECK-NEXT: v_writelane_b32 v1, s63, 29 -; CHECK-NEXT: v_writelane_b32 v5, s7, 35 -; CHECK-NEXT: v_writelane_b32 v1, s64, 30 -; CHECK-NEXT: v_writelane_b32 v5, s8, 36 -; CHECK-NEXT: v_writelane_b32 v1, s65, 31 -; CHECK-NEXT: v_writelane_b32 v5, s9, 37 -; CHECK-NEXT: v_writelane_b32 v1, s66, 32 +; CHECK-NEXT: v_writelane_b32 v4, s4, 32 +; CHECK-NEXT: v_writelane_b32 v8, s61, 27 +; CHECK-NEXT: v_writelane_b32 v4, s5, 33 +; CHECK-NEXT: v_writelane_b32 v8, s62, 28 +; CHECK-NEXT: v_writelane_b32 v4, s6, 34 +; CHECK-NEXT: v_writelane_b32 v8, s63, 29 +; CHECK-NEXT: v_writelane_b32 v4, s7, 35 +; CHECK-NEXT: v_writelane_b32 v8, s64, 30 +; CHECK-NEXT: v_writelane_b32 v4, s8, 36 +; CHECK-NEXT: v_writelane_b32 v8, s65, 31 +; CHECK-NEXT: v_writelane_b32 v4, s9, 37 +; CHECK-NEXT: v_writelane_b32 v8, s66, 32 ; CHECK-NEXT: s_movk_i32 s26, 0x1f0 ; CHECK-NEXT: s_movk_i32 s28, 0x2f0 ; CHECK-NEXT: s_mov_b32 s27, s24 ; 
CHECK-NEXT: s_mov_b32 s29, s24 -; CHECK-NEXT: v_writelane_b32 v5, s10, 38 -; CHECK-NEXT: v_writelane_b32 v1, s67, 33 -; CHECK-NEXT: v_writelane_b32 v5, s11, 39 +; CHECK-NEXT: v_writelane_b32 v4, s10, 38 +; CHECK-NEXT: v_writelane_b32 v8, s67, 33 +; CHECK-NEXT: v_writelane_b32 v4, s11, 39 ; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1 -; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane +; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v7, v6 +; CHECK-NEXT: v_mul_f32_e32 v0, v6, v5 ; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25] ; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s36, v5, 0 -; CHECK-NEXT: v_readlane_b32 s44, v5, 8 -; CHECK-NEXT: v_readlane_b32 s45, v5, 9 -; CHECK-NEXT: v_readlane_b32 s46, v5, 10 -; CHECK-NEXT: v_readlane_b32 s47, v5, 11 -; CHECK-NEXT: v_readlane_b32 s48, v5, 12 -; CHECK-NEXT: v_readlane_b32 s49, v5, 13 -; CHECK-NEXT: v_readlane_b32 s50, v5, 14 -; CHECK-NEXT: v_readlane_b32 s51, v5, 15 +; CHECK-NEXT: v_readlane_b32 s36, v4, 0 +; CHECK-NEXT: v_readlane_b32 s44, v4, 8 +; CHECK-NEXT: v_readlane_b32 s45, v4, 9 +; CHECK-NEXT: v_readlane_b32 s46, v4, 10 +; CHECK-NEXT: v_readlane_b32 s47, v4, 11 +; CHECK-NEXT: v_readlane_b32 s48, v4, 12 +; CHECK-NEXT: v_readlane_b32 s49, v4, 13 +; CHECK-NEXT: v_readlane_b32 s50, v4, 14 +; CHECK-NEXT: v_readlane_b32 s51, v4, 15 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s37, v5, 1 -; CHECK-NEXT: v_readlane_b32 s38, v5, 2 -; CHECK-NEXT: v_readlane_b32 s39, v5, 3 -; CHECK-NEXT: v_readlane_b32 s40, v5, 4 -; CHECK-NEXT: image_sample_lz v6, v[2:3], s[44:51], s[20:23] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; 
CHECK-NEXT: v_readlane_b32 s41, v5, 5 -; CHECK-NEXT: v_readlane_b32 s42, v5, 6 -; CHECK-NEXT: v_readlane_b32 s43, v5, 7 +; CHECK-NEXT: v_readlane_b32 s37, v4, 1 +; CHECK-NEXT: v_readlane_b32 s38, v4, 2 +; CHECK-NEXT: v_readlane_b32 s39, v4, 3 +; CHECK-NEXT: v_readlane_b32 s40, v4, 4 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_readlane_b32 s41, v4, 5 +; CHECK-NEXT: v_readlane_b32 s42, v4, 6 +; CHECK-NEXT: v_readlane_b32 s43, v4, 7 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_readlane_b32 s36, v5, 32 -; CHECK-NEXT: v_readlane_b32 s40, v5, 36 -; CHECK-NEXT: v_readlane_b32 s41, v5, 37 -; CHECK-NEXT: v_readlane_b32 s42, v5, 38 -; CHECK-NEXT: v_readlane_b32 s43, v5, 39 +; CHECK-NEXT: v_readlane_b32 s36, v4, 32 +; CHECK-NEXT: v_readlane_b32 s40, v4, 36 +; CHECK-NEXT: v_readlane_b32 s41, v4, 37 +; CHECK-NEXT: v_readlane_b32 s42, v4, 38 +; CHECK-NEXT: v_readlane_b32 s43, v4, 39 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_readlane_b32 s37, v5, 33 -; CHECK-NEXT: v_readlane_b32 s38, v5, 34 +; CHECK-NEXT: v_readlane_b32 s37, v4, 33 +; CHECK-NEXT: v_readlane_b32 s38, v4, 34 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v7, v[2:3], s[60:67], s[40:43] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s39, v5, 35 -; CHECK-NEXT: image_sample_lz v2, v[2:3], s[12:19], s[20:23] dmask:0x1 +; CHECK-NEXT: image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s39, v4, 35 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v2, v2, v7 -; CHECK-NEXT: v_mul_f32_e32 v2, v2, v0 -; CHECK-NEXT: v_mul_f32_e32 v2, v2, v6 +; CHECK-NEXT: v_sub_f32_e32 v1, v1, v6 +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v5 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; 
CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s12, v5, 32 -; CHECK-NEXT: v_readlane_b32 s13, v5, 33 -; CHECK-NEXT: v_readlane_b32 s14, v5, 34 -; CHECK-NEXT: v_readlane_b32 s15, v5, 35 -; CHECK-NEXT: v_readlane_b32 s16, v5, 36 -; CHECK-NEXT: v_readlane_b32 s17, v5, 37 -; CHECK-NEXT: v_readlane_b32 s18, v5, 38 -; CHECK-NEXT: v_readlane_b32 s19, v5, 39 -; CHECK-NEXT: v_writelane_b32 v5, s4, 40 -; CHECK-NEXT: v_writelane_b32 v5, s5, 41 -; CHECK-NEXT: v_writelane_b32 v5, s6, 42 -; CHECK-NEXT: v_writelane_b32 v5, s7, 43 -; CHECK-NEXT: v_writelane_b32 v5, s8, 44 -; CHECK-NEXT: v_writelane_b32 v5, s9, 45 -; CHECK-NEXT: v_writelane_b32 v5, s10, 46 -; CHECK-NEXT: v_writelane_b32 v5, s11, 47 -; CHECK-NEXT: v_writelane_b32 v5, s12, 48 -; CHECK-NEXT: v_writelane_b32 v5, s13, 49 -; CHECK-NEXT: v_writelane_b32 v5, s14, 50 -; CHECK-NEXT: v_writelane_b32 v5, s15, 51 -; CHECK-NEXT: v_writelane_b32 v5, s16, 52 -; CHECK-NEXT: v_writelane_b32 v5, s17, 53 -; CHECK-NEXT: v_writelane_b32 v5, s18, 54 -; CHECK-NEXT: v_writelane_b32 v5, s19, 55 -; CHECK-NEXT: v_writelane_b32 v5, s52, 56 -; CHECK-NEXT: v_writelane_b32 v4, s60, 0 -; CHECK-NEXT: v_writelane_b32 v5, s53, 57 -; CHECK-NEXT: v_writelane_b32 v4, s61, 1 -; CHECK-NEXT: v_writelane_b32 v5, s54, 58 -; CHECK-NEXT: v_writelane_b32 v4, s62, 2 -; CHECK-NEXT: v_writelane_b32 v5, s55, 59 -; CHECK-NEXT: v_writelane_b32 v4, s63, 3 -; CHECK-NEXT: v_writelane_b32 v5, s56, 60 -; CHECK-NEXT: v_writelane_b32 v4, s64, 4 -; CHECK-NEXT: v_writelane_b32 v5, s57, 61 -; CHECK-NEXT: v_writelane_b32 v4, s65, 5 -; CHECK-NEXT: v_writelane_b32 v5, s58, 62 -; CHECK-NEXT: v_writelane_b32 v4, s66, 6 -; CHECK-NEXT: v_writelane_b32 v5, s59, 63 -; CHECK-NEXT: v_writelane_b32 v4, s67, 7 +; CHECK-NEXT: v_readlane_b32 s12, v4, 32 +; CHECK-NEXT: v_readlane_b32 s13, v4, 33 +; CHECK-NEXT: v_readlane_b32 s14, v4, 34 +; CHECK-NEXT: v_readlane_b32 s15, v4, 35 +; CHECK-NEXT: v_readlane_b32 
s16, v4, 36 +; CHECK-NEXT: v_readlane_b32 s17, v4, 37 +; CHECK-NEXT: v_readlane_b32 s18, v4, 38 +; CHECK-NEXT: v_readlane_b32 s19, v4, 39 +; CHECK-NEXT: v_writelane_b32 v4, s4, 40 +; CHECK-NEXT: v_writelane_b32 v4, s5, 41 +; CHECK-NEXT: v_writelane_b32 v4, s6, 42 +; CHECK-NEXT: v_writelane_b32 v4, s7, 43 +; CHECK-NEXT: v_writelane_b32 v4, s8, 44 +; CHECK-NEXT: v_writelane_b32 v4, s9, 45 +; CHECK-NEXT: v_writelane_b32 v4, s10, 46 +; CHECK-NEXT: v_writelane_b32 v4, s11, 47 +; CHECK-NEXT: v_writelane_b32 v4, s12, 48 +; CHECK-NEXT: v_writelane_b32 v4, s13, 49 +; CHECK-NEXT: v_writelane_b32 v4, s14, 50 +; CHECK-NEXT: v_writelane_b32 v4, s15, 51 +; CHECK-NEXT: v_writelane_b32 v4, s16, 52 +; CHECK-NEXT: v_writelane_b32 v4, s17, 53 +; CHECK-NEXT: v_writelane_b32 v4, s18, 54 +; CHECK-NEXT: v_writelane_b32 v4, s19, 55 +; CHECK-NEXT: v_writelane_b32 v4, s52, 56 +; CHECK-NEXT: v_writelane_b32 v3, s60, 0 +; CHECK-NEXT: v_writelane_b32 v4, s53, 57 +; CHECK-NEXT: v_writelane_b32 v3, s61, 1 +; CHECK-NEXT: v_writelane_b32 v4, s54, 58 +; CHECK-NEXT: v_writelane_b32 v3, s62, 2 +; CHECK-NEXT: v_writelane_b32 v4, s55, 59 +; CHECK-NEXT: v_writelane_b32 v3, s63, 3 +; CHECK-NEXT: v_writelane_b32 v4, s56, 60 +; CHECK-NEXT: v_writelane_b32 v3, s64, 4 +; CHECK-NEXT: v_writelane_b32 v4, s57, 61 +; CHECK-NEXT: v_writelane_b32 v3, s65, 5 +; CHECK-NEXT: v_writelane_b32 v4, s58, 62 +; CHECK-NEXT: v_writelane_b32 v3, s66, 6 +; CHECK-NEXT: v_writelane_b32 v4, s59, 63 +; CHECK-NEXT: v_writelane_b32 v3, s67, 7 ; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 @@ -218,102 +218,102 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: ; %bb.5: ; %bb43 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s8 -; CHECK-NEXT: v_readlane_b32 s36, v5, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, s9 +; CHECK-NEXT: v_mov_b32_e32 v0, s8 +; CHECK-NEXT: v_readlane_b32 s36, v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; 
CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_readlane_b32 s37, v5, 1 -; CHECK-NEXT: v_readlane_b32 s38, v5, 2 -; CHECK-NEXT: v_readlane_b32 s39, v5, 3 -; CHECK-NEXT: v_readlane_b32 s40, v5, 4 -; CHECK-NEXT: v_readlane_b32 s41, v5, 5 -; CHECK-NEXT: v_readlane_b32 s42, v5, 6 -; CHECK-NEXT: v_readlane_b32 s43, v5, 7 -; CHECK-NEXT: v_readlane_b32 s44, v5, 8 -; CHECK-NEXT: v_readlane_b32 s45, v5, 9 -; CHECK-NEXT: v_readlane_b32 s46, v5, 10 -; CHECK-NEXT: v_readlane_b32 s47, v5, 11 -; CHECK-NEXT: v_readlane_b32 s48, v5, 12 -; CHECK-NEXT: v_readlane_b32 s49, v5, 13 -; CHECK-NEXT: v_readlane_b32 s50, v5, 14 -; CHECK-NEXT: v_readlane_b32 s51, v5, 15 -; CHECK-NEXT: image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s36, v5, 16 -; CHECK-NEXT: v_readlane_b32 s44, v5, 24 -; CHECK-NEXT: v_readlane_b32 s45, v5, 25 -; CHECK-NEXT: v_readlane_b32 s46, v5, 26 -; CHECK-NEXT: v_readlane_b32 s47, v5, 27 -; CHECK-NEXT: v_readlane_b32 s48, v5, 28 -; CHECK-NEXT: v_readlane_b32 s49, v5, 29 -; CHECK-NEXT: v_readlane_b32 s50, v5, 30 -; CHECK-NEXT: v_readlane_b32 s51, v5, 31 -; CHECK-NEXT: v_mov_b32_e32 v7, 0 -; CHECK-NEXT: v_mov_b32_e32 v8, v7 -; CHECK-NEXT: v_readlane_b32 s37, v5, 17 -; CHECK-NEXT: v_readlane_b32 s38, v5, 18 -; CHECK-NEXT: v_readlane_b32 s39, v5, 19 -; CHECK-NEXT: image_sample_lz v2, v[2:3], s[44:51], s[12:15] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s40, v5, 20 -; CHECK-NEXT: v_readlane_b32 s41, v5, 21 -; CHECK-NEXT: v_readlane_b32 s42, v5, 22 -; CHECK-NEXT: v_readlane_b32 s43, v5, 23 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_readlane_b32 s37, v4, 1 +; CHECK-NEXT: v_readlane_b32 s38, v4, 2 +; CHECK-NEXT: v_readlane_b32 s39, v4, 3 +; CHECK-NEXT: v_readlane_b32 s40, v4, 4 +; CHECK-NEXT: v_readlane_b32 s41, v4, 5 +; CHECK-NEXT: v_readlane_b32 s42, v4, 6 +; CHECK-NEXT: v_readlane_b32 s43, v4, 7 +; CHECK-NEXT: v_readlane_b32 s44, v4, 8 +; CHECK-NEXT: v_readlane_b32 s45, v4, 9 +; CHECK-NEXT: 
v_readlane_b32 s46, v4, 10 +; CHECK-NEXT: v_readlane_b32 s47, v4, 11 +; CHECK-NEXT: v_readlane_b32 s48, v4, 12 +; CHECK-NEXT: v_readlane_b32 s49, v4, 13 +; CHECK-NEXT: v_readlane_b32 s50, v4, 14 +; CHECK-NEXT: v_readlane_b32 s51, v4, 15 +; CHECK-NEXT: image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s36, v4, 16 +; CHECK-NEXT: v_readlane_b32 s44, v4, 24 +; CHECK-NEXT: v_readlane_b32 s45, v4, 25 +; CHECK-NEXT: v_readlane_b32 s46, v4, 26 +; CHECK-NEXT: v_readlane_b32 s47, v4, 27 +; CHECK-NEXT: v_readlane_b32 s48, v4, 28 +; CHECK-NEXT: v_readlane_b32 s49, v4, 29 +; CHECK-NEXT: v_readlane_b32 s50, v4, 30 +; CHECK-NEXT: v_readlane_b32 s51, v4, 31 +; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: v_mov_b32_e32 v7, v6 +; CHECK-NEXT: v_readlane_b32 s37, v4, 17 +; CHECK-NEXT: v_readlane_b32 s38, v4, 18 +; CHECK-NEXT: v_readlane_b32 s39, v4, 19 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s40, v4, 20 +; CHECK-NEXT: v_readlane_b32 s41, v4, 21 +; CHECK-NEXT: v_readlane_b32 s42, v4, 22 +; CHECK-NEXT: v_readlane_b32 s43, v4, 23 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[6:8], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[5:7], off, s[8:11], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .LBB0_6: ; %Flow12 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23] -; CHECK-NEXT: v_readlane_b32 s52, v5, 40 -; CHECK-NEXT: v_readlane_b32 s53, v5, 41 -; CHECK-NEXT: v_readlane_b32 s54, v5, 42 -; CHECK-NEXT: v_readlane_b32 s55, v5, 43 -; CHECK-NEXT: v_readlane_b32 s56, v5, 44 -; CHECK-NEXT: v_readlane_b32 s57, v5, 45 -; CHECK-NEXT: v_readlane_b32 s58, v5, 46 -; CHECK-NEXT: v_readlane_b32 s59, v5, 47 -; CHECK-NEXT: v_readlane_b32 s60, v5, 48 -; CHECK-NEXT: v_readlane_b32 s61, v5, 49 -; CHECK-NEXT: v_readlane_b32 
s62, v5, 50 -; CHECK-NEXT: v_readlane_b32 s63, v5, 51 -; CHECK-NEXT: v_readlane_b32 s64, v5, 52 -; CHECK-NEXT: v_readlane_b32 s65, v5, 53 -; CHECK-NEXT: v_readlane_b32 s66, v5, 54 -; CHECK-NEXT: v_readlane_b32 s67, v5, 55 +; CHECK-NEXT: v_readlane_b32 s52, v4, 40 +; CHECK-NEXT: v_readlane_b32 s53, v4, 41 +; CHECK-NEXT: v_readlane_b32 s54, v4, 42 +; CHECK-NEXT: v_readlane_b32 s55, v4, 43 +; CHECK-NEXT: v_readlane_b32 s56, v4, 44 +; CHECK-NEXT: v_readlane_b32 s57, v4, 45 +; CHECK-NEXT: v_readlane_b32 s58, v4, 46 +; CHECK-NEXT: v_readlane_b32 s59, v4, 47 +; CHECK-NEXT: v_readlane_b32 s60, v4, 48 +; CHECK-NEXT: v_readlane_b32 s61, v4, 49 +; CHECK-NEXT: v_readlane_b32 s62, v4, 50 +; CHECK-NEXT: v_readlane_b32 s63, v4, 51 +; CHECK-NEXT: v_readlane_b32 s64, v4, 52 +; CHECK-NEXT: v_readlane_b32 s65, v4, 53 +; CHECK-NEXT: v_readlane_b32 s66, v4, 54 +; CHECK-NEXT: v_readlane_b32 s67, v4, 55 ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_readlane_b32 s36, v5, 56 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: v_readlane_b32 s36, v4, 56 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: v_readlane_b32 s37, v5, 57 -; CHECK-NEXT: v_readlane_b32 s38, v5, 58 -; CHECK-NEXT: v_readlane_b32 s39, v5, 59 -; CHECK-NEXT: v_readlane_b32 s40, v5, 60 -; CHECK-NEXT: v_readlane_b32 s41, v5, 61 -; CHECK-NEXT: v_readlane_b32 s42, v5, 62 -; CHECK-NEXT: v_readlane_b32 s43, v5, 63 +; CHECK-NEXT: v_mov_b32_e32 v2, s7 +; CHECK-NEXT: v_readlane_b32 s37, v4, 57 +; CHECK-NEXT: v_readlane_b32 s38, v4, 58 +; CHECK-NEXT: v_readlane_b32 s39, v4, 59 +; CHECK-NEXT: v_readlane_b32 s40, v4, 60 +; CHECK-NEXT: v_readlane_b32 s41, v4, 61 +; CHECK-NEXT: v_readlane_b32 s42, v4, 62 +; CHECK-NEXT: 
v_readlane_b32 s43, v4, 63 ; CHECK-NEXT: s_nop 4 -; CHECK-NEXT: image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v7, v[2:3], s[52:59], s[8:11] dmask:0x1 -; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1 +; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2 ; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37] ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: v_readlane_b32 s44, v4, 0 -; CHECK-NEXT: v_readlane_b32 s45, v4, 1 -; CHECK-NEXT: v_readlane_b32 s46, v4, 2 -; CHECK-NEXT: v_readlane_b32 s47, v4, 3 -; CHECK-NEXT: v_readlane_b32 s48, v4, 4 -; CHECK-NEXT: v_readlane_b32 s49, v4, 5 -; CHECK-NEXT: v_readlane_b32 s50, v4, 6 -; CHECK-NEXT: v_readlane_b32 s51, v4, 7 +; CHECK-NEXT: v_readlane_b32 s44, v3, 0 +; CHECK-NEXT: v_readlane_b32 s45, v3, 1 +; CHECK-NEXT: v_readlane_b32 s46, v3, 2 +; CHECK-NEXT: v_readlane_b32 s47, v3, 3 +; CHECK-NEXT: v_readlane_b32 s48, v3, 4 +; CHECK-NEXT: v_readlane_b32 s49, v3, 5 +; CHECK-NEXT: v_readlane_b32 s50, v3, 6 +; CHECK-NEXT: v_readlane_b32 s51, v3, 7 ; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39] ; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41] ; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43] @@ -321,59 +321,59 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 ; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v2, v7, v6 -; CHECK-NEXT: v_mul_f32_e32 v0, v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_sub_f32_e32 v1, v6, v5 +; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB0_8: ; %bb33 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_f32_e32 v3, v2, v0 -; CHECK-NEXT: v_sub_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_add_f32_e32 v2, v1, v0 +; CHECK-NEXT: v_sub_f32_e32 v1, v1, v2 ; 
CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB0_8 ; CHECK-NEXT: .LBB0_9: ; %Flow13 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] -; CHECK-NEXT: v_readlane_b32 s67, v1, 33 -; CHECK-NEXT: v_readlane_b32 s66, v1, 32 -; CHECK-NEXT: v_readlane_b32 s65, v1, 31 -; CHECK-NEXT: v_readlane_b32 s64, v1, 30 -; CHECK-NEXT: v_readlane_b32 s63, v1, 29 -; CHECK-NEXT: v_readlane_b32 s62, v1, 28 -; CHECK-NEXT: v_readlane_b32 s61, v1, 27 -; CHECK-NEXT: v_readlane_b32 s60, v1, 26 -; CHECK-NEXT: v_readlane_b32 s59, v1, 25 -; CHECK-NEXT: v_readlane_b32 s58, v1, 24 -; CHECK-NEXT: v_readlane_b32 s57, v1, 23 -; CHECK-NEXT: v_readlane_b32 s56, v1, 22 -; CHECK-NEXT: v_readlane_b32 s55, v1, 21 -; CHECK-NEXT: v_readlane_b32 s54, v1, 20 -; CHECK-NEXT: v_readlane_b32 s53, v1, 19 -; CHECK-NEXT: v_readlane_b32 s52, v1, 18 -; CHECK-NEXT: v_readlane_b32 s51, v1, 17 -; CHECK-NEXT: v_readlane_b32 s50, v1, 16 -; CHECK-NEXT: v_readlane_b32 s49, v1, 15 -; CHECK-NEXT: v_readlane_b32 s48, v1, 14 -; CHECK-NEXT: v_readlane_b32 s47, v1, 13 -; CHECK-NEXT: v_readlane_b32 s46, v1, 12 -; CHECK-NEXT: v_readlane_b32 s45, v1, 11 -; CHECK-NEXT: v_readlane_b32 s44, v1, 10 -; CHECK-NEXT: v_readlane_b32 s43, v1, 9 -; CHECK-NEXT: v_readlane_b32 s42, v1, 8 -; CHECK-NEXT: v_readlane_b32 s41, v1, 7 -; CHECK-NEXT: v_readlane_b32 s40, v1, 6 -; CHECK-NEXT: v_readlane_b32 s39, v1, 5 -; CHECK-NEXT: v_readlane_b32 s38, v1, 4 -; CHECK-NEXT: v_readlane_b32 s37, v1, 3 -; CHECK-NEXT: v_readlane_b32 s36, v1, 2 -; CHECK-NEXT: v_readlane_b32 s31, v1, 1 -; CHECK-NEXT: v_readlane_b32 s30, v1, 0 -; CHECK-NEXT: ; kill: killed $vgpr5 +; CHECK-NEXT: v_readlane_b32 s67, v8, 33 +; CHECK-NEXT: v_readlane_b32 s66, v8, 32 +; CHECK-NEXT: v_readlane_b32 s65, v8, 31 +; CHECK-NEXT: v_readlane_b32 s64, v8, 30 +; CHECK-NEXT: v_readlane_b32 s63, v8, 29 +; CHECK-NEXT: v_readlane_b32 s62, v8, 28 +; CHECK-NEXT: v_readlane_b32 s61, v8, 27 +; CHECK-NEXT: 
v_readlane_b32 s60, v8, 26 +; CHECK-NEXT: v_readlane_b32 s59, v8, 25 +; CHECK-NEXT: v_readlane_b32 s58, v8, 24 +; CHECK-NEXT: v_readlane_b32 s57, v8, 23 +; CHECK-NEXT: v_readlane_b32 s56, v8, 22 +; CHECK-NEXT: v_readlane_b32 s55, v8, 21 +; CHECK-NEXT: v_readlane_b32 s54, v8, 20 +; CHECK-NEXT: v_readlane_b32 s53, v8, 19 +; CHECK-NEXT: v_readlane_b32 s52, v8, 18 +; CHECK-NEXT: v_readlane_b32 s51, v8, 17 +; CHECK-NEXT: v_readlane_b32 s50, v8, 16 +; CHECK-NEXT: v_readlane_b32 s49, v8, 15 +; CHECK-NEXT: v_readlane_b32 s48, v8, 14 +; CHECK-NEXT: v_readlane_b32 s47, v8, 13 +; CHECK-NEXT: v_readlane_b32 s46, v8, 12 +; CHECK-NEXT: v_readlane_b32 s45, v8, 11 +; CHECK-NEXT: v_readlane_b32 s44, v8, 10 +; CHECK-NEXT: v_readlane_b32 s43, v8, 9 +; CHECK-NEXT: v_readlane_b32 s42, v8, 8 +; CHECK-NEXT: v_readlane_b32 s41, v8, 7 +; CHECK-NEXT: v_readlane_b32 s40, v8, 6 +; CHECK-NEXT: v_readlane_b32 s39, v8, 5 +; CHECK-NEXT: v_readlane_b32 s38, v8, 4 +; CHECK-NEXT: v_readlane_b32 s37, v8, 3 +; CHECK-NEXT: v_readlane_b32 s36, v8, 2 +; CHECK-NEXT: v_readlane_b32 s31, v8, 1 +; CHECK-NEXT: v_readlane_b32 s30, v8, 0 ; CHECK-NEXT: ; kill: killed $vgpr4 +; CHECK-NEXT: ; kill: killed $vgpr3 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 3aaf04c94cda5..408199bbc9223 100644 --- 
a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -1042,92 +1042,92 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: v_writelane_b32 v40, s52, 20 -; GCN-NEXT: v_writelane_b32 v40, s53, 21 -; GCN-NEXT: v_writelane_b32 v40, s54, 22 -; GCN-NEXT: v_writelane_b32 v40, s55, 23 -; GCN-NEXT: v_writelane_b32 v40, s56, 24 -; GCN-NEXT: v_writelane_b32 v40, s57, 25 -; GCN-NEXT: v_writelane_b32 v40, s58, 26 -; GCN-NEXT: v_writelane_b32 v40, s59, 27 -; GCN-NEXT: v_writelane_b32 v40, s60, 28 -; GCN-NEXT: v_writelane_b32 v40, s61, 29 -; GCN-NEXT: v_writelane_b32 v40, s62, 30 -; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v41, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s31, 1 +; GCN-NEXT: v_writelane_b32 v41, s34, 2 +; GCN-NEXT: v_writelane_b32 v41, s35, 3 +; GCN-NEXT: v_writelane_b32 v41, s36, 4 +; GCN-NEXT: v_writelane_b32 v41, s37, 5 +; GCN-NEXT: v_writelane_b32 v41, s38, 6 +; GCN-NEXT: v_writelane_b32 v41, s39, 7 +; GCN-NEXT: v_writelane_b32 v41, s40, 8 +; GCN-NEXT: v_writelane_b32 v41, s41, 9 +; GCN-NEXT: v_writelane_b32 v41, s42, 10 +; GCN-NEXT: v_writelane_b32 v41, s43, 11 +; GCN-NEXT: v_writelane_b32 v41, s44, 12 +; GCN-NEXT: v_writelane_b32 v41, s45, 13 +; GCN-NEXT: v_writelane_b32 v41, s46, 14 +; GCN-NEXT: v_writelane_b32 v41, s47, 15 +; GCN-NEXT: v_writelane_b32 v41, s48, 16 +; GCN-NEXT: v_writelane_b32 v41, s49, 17 +; GCN-NEXT: v_writelane_b32 v41, s50, 18 +; GCN-NEXT: v_writelane_b32 v41, s51, 19 +; GCN-NEXT: v_writelane_b32 v41, s52, 20 +; GCN-NEXT: v_writelane_b32 v41, s53, 21 +; GCN-NEXT: v_writelane_b32 v41, s54, 22 +; GCN-NEXT: v_writelane_b32 v41, s55, 23 +; GCN-NEXT: v_writelane_b32 v41, s56, 24 +; GCN-NEXT: v_writelane_b32 v41, s57, 25 +; GCN-NEXT: v_writelane_b32 v41, s58, 26 +; GCN-NEXT: v_writelane_b32 v41, s59, 27 +; GCN-NEXT: v_writelane_b32 v41, s60, 28 +; GCN-NEXT: v_writelane_b32 v41, s61, 29 +; GCN-NEXT: v_writelane_b32 v41, s62, 30 +; GCN-NEXT: v_writelane_b32 v41, s63, 31 +; GCN-NEXT: v_mov_b32_e32 v40, v0 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s6, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v2 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: v_mov_b32_e32 v0, v41 +; GCN-NEXT: v_mov_b32_e32 v0, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execnz .LBB7_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: 
v_mov_b32_e32 v0, v41 -; GCN-NEXT: v_readlane_b32 s63, v40, 31 -; GCN-NEXT: v_readlane_b32 s62, v40, 30 -; GCN-NEXT: v_readlane_b32 s61, v40, 29 -; GCN-NEXT: v_readlane_b32 s60, v40, 28 -; GCN-NEXT: v_readlane_b32 s59, v40, 27 -; GCN-NEXT: v_readlane_b32 s58, v40, 26 -; GCN-NEXT: v_readlane_b32 s57, v40, 25 -; GCN-NEXT: v_readlane_b32 s56, v40, 24 -; GCN-NEXT: v_readlane_b32 s55, v40, 23 -; GCN-NEXT: v_readlane_b32 s54, v40, 22 -; GCN-NEXT: v_readlane_b32 s53, v40, 21 -; GCN-NEXT: v_readlane_b32 s52, v40, 20 -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: v_readlane_b32 s63, v41, 31 +; GCN-NEXT: v_readlane_b32 s62, v41, 30 +; GCN-NEXT: v_readlane_b32 s61, v41, 29 +; GCN-NEXT: v_readlane_b32 s60, v41, 28 +; GCN-NEXT: v_readlane_b32 s59, v41, 27 +; GCN-NEXT: v_readlane_b32 s58, v41, 26 +; GCN-NEXT: v_readlane_b32 s57, v41, 25 +; GCN-NEXT: v_readlane_b32 s56, v41, 24 +; GCN-NEXT: v_readlane_b32 s55, v41, 23 +; GCN-NEXT: v_readlane_b32 s54, v41, 22 +; GCN-NEXT: v_readlane_b32 s53, v41, 21 +; GCN-NEXT: v_readlane_b32 s52, v41, 20 +; GCN-NEXT: v_readlane_b32 s51, v41, 19 +; GCN-NEXT: v_readlane_b32 
s50, v41, 18 +; GCN-NEXT: v_readlane_b32 s49, v41, 17 +; GCN-NEXT: v_readlane_b32 s48, v41, 16 +; GCN-NEXT: v_readlane_b32 s47, v41, 15 +; GCN-NEXT: v_readlane_b32 s46, v41, 14 +; GCN-NEXT: v_readlane_b32 s45, v41, 13 +; GCN-NEXT: v_readlane_b32 s44, v41, 12 +; GCN-NEXT: v_readlane_b32 s43, v41, 11 +; GCN-NEXT: v_readlane_b32 s42, v41, 10 +; GCN-NEXT: v_readlane_b32 s41, v41, 9 +; GCN-NEXT: v_readlane_b32 s40, v41, 8 +; GCN-NEXT: v_readlane_b32 s39, v41, 7 +; GCN-NEXT: v_readlane_b32 s38, v41, 6 +; GCN-NEXT: v_readlane_b32 s37, v41, 5 +; GCN-NEXT: v_readlane_b32 s36, v41, 4 +; GCN-NEXT: v_readlane_b32 s35, v41, 3 +; GCN-NEXT: v_readlane_b32 s34, v41, 2 +; GCN-NEXT: v_readlane_b32 s31, v41, 1 +; GCN-NEXT: v_readlane_b32 s30, v41, 0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 @@ -1140,92 +1140,92 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, 
s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 -; GISEL-NEXT: v_writelane_b32 v40, s52, 20 -; GISEL-NEXT: v_writelane_b32 v40, s53, 21 -; GISEL-NEXT: v_writelane_b32 v40, s54, 22 -; GISEL-NEXT: v_writelane_b32 v40, s55, 23 -; GISEL-NEXT: v_writelane_b32 v40, s56, 24 -; GISEL-NEXT: v_writelane_b32 v40, s57, 25 -; GISEL-NEXT: v_writelane_b32 v40, s58, 26 -; GISEL-NEXT: v_writelane_b32 v40, s59, 27 -; GISEL-NEXT: v_writelane_b32 v40, s60, 28 -; GISEL-NEXT: v_writelane_b32 v40, s61, 29 -; GISEL-NEXT: v_writelane_b32 v40, s62, 30 -; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: v_mov_b32_e32 v41, v0 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: v_writelane_b32 v41, s30, 0 +; GISEL-NEXT: v_writelane_b32 v41, s31, 1 +; GISEL-NEXT: v_writelane_b32 v41, s34, 2 +; GISEL-NEXT: v_writelane_b32 v41, s35, 3 +; GISEL-NEXT: v_writelane_b32 v41, s36, 4 +; GISEL-NEXT: v_writelane_b32 v41, s37, 5 +; GISEL-NEXT: v_writelane_b32 v41, s38, 6 +; GISEL-NEXT: v_writelane_b32 v41, s39, 7 +; GISEL-NEXT: v_writelane_b32 v41, s40, 8 +; GISEL-NEXT: v_writelane_b32 v41, s41, 9 +; GISEL-NEXT: v_writelane_b32 v41, s42, 10 +; GISEL-NEXT: v_writelane_b32 v41, s43, 11 +; GISEL-NEXT: v_writelane_b32 v41, s44, 12 +; GISEL-NEXT: v_writelane_b32 v41, s45, 13 +; GISEL-NEXT: v_writelane_b32 v41, s46, 14 +; GISEL-NEXT: v_writelane_b32 v41, s47, 15 +; GISEL-NEXT: v_writelane_b32 v41, s48, 16 +; GISEL-NEXT: v_writelane_b32 v41, s49, 17 +; GISEL-NEXT: v_writelane_b32 v41, s50, 
18 +; GISEL-NEXT: v_writelane_b32 v41, s51, 19 +; GISEL-NEXT: v_writelane_b32 v41, s52, 20 +; GISEL-NEXT: v_writelane_b32 v41, s53, 21 +; GISEL-NEXT: v_writelane_b32 v41, s54, 22 +; GISEL-NEXT: v_writelane_b32 v41, s55, 23 +; GISEL-NEXT: v_writelane_b32 v41, s56, 24 +; GISEL-NEXT: v_writelane_b32 v41, s57, 25 +; GISEL-NEXT: v_writelane_b32 v41, s58, 26 +; GISEL-NEXT: v_writelane_b32 v41, s59, 27 +; GISEL-NEXT: v_writelane_b32 v41, s60, 28 +; GISEL-NEXT: v_writelane_b32 v41, s61, 29 +; GISEL-NEXT: v_writelane_b32 v41, s62, 30 +; GISEL-NEXT: v_writelane_b32 v41, s63, 31 +; GISEL-NEXT: v_mov_b32_e32 v40, v0 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v1 ; GISEL-NEXT: v_readfirstlane_b32 s7, v2 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] ; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v41 +; GISEL-NEXT: v_mov_b32_e32 v0, v40 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr1 ; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB7_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_mov_b32_e32 v0, v41 -; GISEL-NEXT: v_readlane_b32 s63, v40, 31 -; GISEL-NEXT: v_readlane_b32 s62, v40, 30 -; GISEL-NEXT: v_readlane_b32 s61, v40, 29 -; GISEL-NEXT: v_readlane_b32 s60, v40, 28 -; GISEL-NEXT: v_readlane_b32 s59, v40, 27 -; GISEL-NEXT: v_readlane_b32 s58, v40, 26 -; GISEL-NEXT: v_readlane_b32 s57, v40, 25 -; GISEL-NEXT: v_readlane_b32 s56, v40, 24 -; GISEL-NEXT: v_readlane_b32 s55, v40, 23 -; GISEL-NEXT: v_readlane_b32 s54, v40, 22 -; GISEL-NEXT: v_readlane_b32 s53, v40, 21 -; GISEL-NEXT: v_readlane_b32 s52, v40, 20 -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; 
GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: v_mov_b32_e32 v0, v40 +; GISEL-NEXT: v_readlane_b32 s63, v41, 31 +; GISEL-NEXT: v_readlane_b32 s62, v41, 30 +; GISEL-NEXT: v_readlane_b32 s61, v41, 29 +; GISEL-NEXT: v_readlane_b32 s60, v41, 28 +; GISEL-NEXT: v_readlane_b32 s59, v41, 27 +; GISEL-NEXT: v_readlane_b32 s58, v41, 26 +; GISEL-NEXT: v_readlane_b32 s57, v41, 25 +; GISEL-NEXT: v_readlane_b32 s56, v41, 24 +; GISEL-NEXT: v_readlane_b32 s55, v41, 23 +; GISEL-NEXT: v_readlane_b32 s54, v41, 22 +; GISEL-NEXT: v_readlane_b32 s53, v41, 21 +; GISEL-NEXT: v_readlane_b32 s52, v41, 20 +; GISEL-NEXT: v_readlane_b32 s51, v41, 19 +; GISEL-NEXT: v_readlane_b32 s50, v41, 18 +; GISEL-NEXT: v_readlane_b32 s49, v41, 17 +; GISEL-NEXT: v_readlane_b32 s48, v41, 16 +; GISEL-NEXT: v_readlane_b32 s47, v41, 15 +; GISEL-NEXT: v_readlane_b32 s46, v41, 14 +; GISEL-NEXT: v_readlane_b32 s45, v41, 13 +; GISEL-NEXT: v_readlane_b32 s44, v41, 12 +; GISEL-NEXT: v_readlane_b32 s43, v41, 11 +; GISEL-NEXT: v_readlane_b32 s42, v41, 10 +; GISEL-NEXT: v_readlane_b32 s41, v41, 9 +; GISEL-NEXT: v_readlane_b32 s40, v41, 8 +; GISEL-NEXT: v_readlane_b32 s39, v41, 7 +; GISEL-NEXT: v_readlane_b32 s38, v41, 6 +; GISEL-NEXT: v_readlane_b32 s37, v41, 5 +; GISEL-NEXT: v_readlane_b32 s36, v41, 4 +; GISEL-NEXT: v_readlane_b32 s35, v41, 3 +; GISEL-NEXT: v_readlane_b32 s34, v41, 2 +; 
GISEL-NEXT: v_readlane_b32 s31, v41, 1 +; GISEL-NEXT: v_readlane_b32 s30, v41, 0 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index cfec77e68eae9..833aba9b26afd 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -13,7 +13,7 @@ define fastcc i32 @foo() { ; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33 ; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32 ; CHECK-NEXT: $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40 @@ -26,8 +26,8 @@ define fastcc i32 @foo() { ; CHECK-NEXT: BUFFER_GL0_INV implicit $exec ; CHECK-NEXT: BUFFER_GL1_INV implicit $exec ; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40 - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 
0, killed $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, killed $vgpr40 ; CHECK-NEXT: S_WAITCNT 49279 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo @@ -43,7 +43,7 @@ define fastcc i32 @foo() { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1 ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0 - ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2 + ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 killed $vgpr40, 2 ; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index 0769337127870..6c8646968b676 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -42,11 +42,11 @@ define amdgpu_kernel void @kernel_call() #0 { ; GCN-LABEL: {{^}}func_regular_call: ; GCN-NOT: buffer_load ; GCN-NOT: readlane -; GCN: flat_load_dword v9 +; GCN: flat_load_dword v8 ; GCN: s_swappc_b64 ; GCN-NOT: buffer_load ; GCN-NOT: readlane -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v9 +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: ; NumSgprs: 34 ; GCN: ; NumVgprs: 10 @@ -72,9 +72,9 @@ define void @func_tail_call() #1 { } ; GCN-LABEL: {{^}}func_call_tail_call: -; GCN: flat_load_dword v9 +; GCN: flat_load_dword v8 ; GCN: s_swappc_b64 -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v9 +; GCN: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: s_setpc_b64 ; GCN: ; NumSgprs: 34 diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 797b13044e722..1e9994dd8e6ef 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -191,47 +191,47 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: v_writelane_b32 v40, s4, 5 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v43, s4, 5 +; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_writelane_b32 v43, s31, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s36, 3 +; GFX9-NEXT: v_writelane_b32 v43, s34, 2 +; GFX9-NEXT: v_writelane_b32 v43, s36, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s37, 4 +; GFX9-NEXT: v_writelane_b32 v43, s37, 4 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: v_mov_b32_e32 v42, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 ; GFX9-NEXT: s_mov_b32 s34, s15 -; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 +; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 ; GFX9-NEXT: s_mov_b32 s15, s34 -; GFX9-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 +; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 ; GFX9-NEXT: s_mov_b32 s15, s34 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s37, v40, 4 -; GFX9-NEXT: v_readlane_b32 s36, v40, 3 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 5 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s37, v43, 4 +; GFX9-NEXT: v_readlane_b32 s36, v43, 3 +; GFX9-NEXT: v_readlane_b32 s34, v43, 2 +; GFX9-NEXT: v_readlane_b32 s31, v43, 1 +; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: v_readlane_b32 s4, v43, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; 
GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll index e0efd09f1f14b..4ea77d1d1ac15 100644 --- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll @@ -8,23 +8,23 @@ define void @test_remat_s_getpc_b64() { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v0, s30, 0 +; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: v_writelane_b32 v0, s31, 1 +; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: global_store_dwordx2 v[1:2], v[1:2], off -; GFX9-NEXT: v_readlane_b32 s31, v0, 1 -; GFX9-NEXT: v_readlane_b32 s30, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_readlane_b32 s31, v2, 1 +; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -33,23 +33,23 @@ define void @test_remat_s_getpc_b64() { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded 
Spill +; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v0, s30, 0 +; GFX11-NEXT: v_writelane_b32 v2, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v0, s31, 1 +; GFX11-NEXT: v_writelane_b32 v2, s31, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_readlane_b32 s31, v0, 1 -; GFX11-NEXT: v_readlane_b32 s30, v0, 0 -; GFX11-NEXT: global_store_b64 v[1:2], v[1:2], off +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_readlane_b32 s31, v2, 1 +; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -62,26 +62,26 @@ define void @test_remat_s_getpc_b64() { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX12-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v0, s30, 0 +; GFX12-NEXT: v_writelane_b32 v2, s30, 0 ; GFX12-NEXT: s_getpc_b64 s[0:1] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_sext_i32_i16 s1, s1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_writelane_b32 v0, s31, 1 +; GFX12-NEXT: v_writelane_b32 v2, s31, 1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_getpc_b64 s[0:1] 
; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s31, v0, 1 -; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX12-NEXT: v_readlane_b32 s30, v0, 0 -; GFX12-NEXT: global_store_b64 v[1:2], v[1:2], off +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_readlane_b32 s31, v2, 1 +; GFX12-NEXT: v_readlane_b32 s30, v2, 0 +; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GFX12-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 4544b177351ee..6a2532147f886 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -28,183 +28,183 @@ body: | ; GCN-LABEL: name: test_main ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5 + ; GCN-NEXT: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, 
$sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr0 = COPY $sgpr33 ; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32 ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 12, 0, 
implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr3 + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr5 ; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc - ; GCN-NEXT: renamable $vgpr5 = IMPLICIT_DEF - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr15, 11, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr16, 12, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr17, 13, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr18, 14, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr19, 15, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr20, 16, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr21, 17, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr22, 18, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr23, 19, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr24, 20, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr25, 21, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr26, 22, $vgpr1 - ; 
GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr1 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr2 - ; GCN-NEXT: $vgpr2 = 
SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr2 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr3 + ; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr15, 11, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr16, 12, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr17, 13, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr18, 14, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr19, 15, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr20, 16, $vgpr3 + ; 
GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr21, 17, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr22, 18, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr23, 19, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr24, 20, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr25, 21, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr26, 22, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr3 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr4 + ; GCN-NEXT: $vgpr4 = 
SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr4 + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr5 + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr5 + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr5 + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5 ; GCN-NEXT: $sgpr22 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5 - ; GCN-NEXT: dead $vgpr4 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc + ; GCN-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2 + ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr5 + ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: KILL implicit-def $vcc, implicit-def 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr5 + ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 0 + ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr5 + ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3 - ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2 - ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 - ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 - ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31 - ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30 - ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29 - ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28 - ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27 - ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26 - ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25 - ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24 - ; GCN-NEXT: $sgpr91 = 
SI_RESTORE_S32_FROM_VGPR $vgpr2, 23 - ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 22 - ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 21 - ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 20 - ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 19 - ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 18 - ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 17 - ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 16 - ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 15 - ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 14 - ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 13 - ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 12 - ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 11 - ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 10 - ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 9 - ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 8 - ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 7 - ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 6 - ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 5 - ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 4 - ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3 - ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2 - ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 31 - ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 30 - ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 29 - ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 28 - ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 27 - ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 26 - ; GCN-NEXT: $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 25 - ; GCN-NEXT: $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 24 - ; GCN-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 23 - ; GCN-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR 
$vgpr1, 22 - ; GCN-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 21 - ; GCN-NEXT: $sgpr24 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 20 - ; GCN-NEXT: $sgpr23 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 19 - ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 18 - ; GCN-NEXT: $sgpr21 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 17 - ; GCN-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 16 - ; GCN-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 15 - ; GCN-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 14 - ; GCN-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 13 - ; GCN-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 12 - ; GCN-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 11 - ; GCN-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 10 - ; GCN-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 9 - ; GCN-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 8 - ; GCN-NEXT: $sgpr11 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 7 - ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 6 - ; GCN-NEXT: $sgpr9 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 5 - ; GCN-NEXT: $sgpr8 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 4 - ; GCN-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3 - ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2 - ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1 - ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0 - ; GCN-NEXT: KILL killed renamable $vgpr5 - ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4 + ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 3 + ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 2 + ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 1 + ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 0 + ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 31 + ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 30 + ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 29 + ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 28 + ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 27 + ; GCN-NEXT: $sgpr94 = 
SI_RESTORE_S32_FROM_VGPR $vgpr4, 26 + ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 25 + ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 24 + ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 23 + ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 22 + ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 21 + ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 20 + ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 19 + ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 18 + ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 17 + ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 16 + ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 15 + ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 14 + ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 13 + ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 12 + ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 11 + ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 10 + ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 9 + ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 8 + ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 7 + ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 6 + ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 5 + ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 4 + ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3 + ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2 + ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1 + ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0 + ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31 + ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30 + ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29 + ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28 + ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27 + ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26 + ; GCN-NEXT: $sgpr29 = SI_RESTORE_S32_FROM_VGPR 
$vgpr3, 25 + ; GCN-NEXT: $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24 + ; GCN-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23 + ; GCN-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22 + ; GCN-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21 + ; GCN-NEXT: $sgpr24 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20 + ; GCN-NEXT: $sgpr23 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19 + ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18 + ; GCN-NEXT: $sgpr21 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17 + ; GCN-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16 + ; GCN-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15 + ; GCN-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14 + ; GCN-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13 + ; GCN-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12 + ; GCN-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11 + ; GCN-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10 + ; GCN-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9 + ; GCN-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8 + ; GCN-NEXT: $sgpr11 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7 + ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6 + ; GCN-NEXT: $sgpr9 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5 + ; GCN-NEXT: $sgpr8 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4 + ; GCN-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3 + ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2 + ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 + ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 + ; GCN-NEXT: KILL killed renamable $vgpr2 + ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 4 ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) - ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) - ; GCN-NEXT: 
$vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) - ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) - ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5) + ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) + ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) + ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) + ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) + ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc ; GCN-NEXT: $sgpr33 = COPY $sgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 16550fc9588ae..f523b4a2495f1 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -1183,7 +1183,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] 
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill @@ -1297,11 +1297,11 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v4, s34, 0 -; GCN-NEXT: v_writelane_b32 v4, s35, 1 -; GCN-NEXT: v_writelane_b32 v4, s36, 2 -; GCN-NEXT: v_writelane_b32 v4, s37, 3 -; GCN-NEXT: v_mov_b32_e32 v5, v3 +; GCN-NEXT: v_writelane_b32 v5, s34, 0 +; GCN-NEXT: v_writelane_b32 v5, s35, 1 +; GCN-NEXT: v_writelane_b32 v5, s36, 2 +; GCN-NEXT: v_writelane_b32 v5, s37, 3 +; GCN-NEXT: v_mov_b32_e32 v4, v3 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 @@ -1310,30 +1310,30 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NEXT: v_mov_b32_e32 v3, v4 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-NEXT: flat_load_dwordx4 v[5:8], v[2:3] +; GCN-NEXT: flat_load_dwordx4 v[6:9], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill 
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx4 v[0:1], v[5:8] -; GCN-NEXT: v_readlane_b32 s37, v4, 3 -; GCN-NEXT: v_readlane_b32 s36, v4, 2 -; GCN-NEXT: v_readlane_b32 s35, v4, 1 -; GCN-NEXT: v_readlane_b32 s34, v4, 0 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GCN-NEXT: v_readlane_b32 s37, v5, 3 +; GCN-NEXT: v_readlane_b32 s36, v5, 2 +; GCN-NEXT: v_readlane_b32 s35, v5, 1 +; GCN-NEXT: v_readlane_b32 s34, v5, 0 ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -1447,7 +1447,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; 
GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 7a01679f9972c..2c0f64f85d823 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -211,15 +211,15 @@ entry: ; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 ; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0 -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir index d718b49321835..85a615c3d8ae8 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir @@ -10,9 +10,9 @@ body: | bb.0: liveins: $sgpr50 ; CHECK-LABEL: name: spill_csr_sgpr_argument - ; CHECK: liveins: $sgpr50, $vgpr0 + ; CHECK: liveins: $sgpr50, $vgpr63 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR 
$sgpr50, 0, $vgpr0 + ; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr50, 0, $vgpr63 ; CHECK-NEXT: S_NOP 0, implicit $sgpr50 ; CHECK-NEXT: $sgpr50 = S_MOV_B32 0 S_NOP 0, implicit $sgpr50 diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir index efb022ccb0d55..11babc82e919b 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -53,41 +53,41 @@ body: | bb.0: liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-LABEL: name: sgpr_spill_lane_crossover - ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr0, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-NEXT: {{ $}} ; GCN-NEXT: 
[[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr66, 2, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr67, 3, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr68, 4, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr69, 5, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr70, 6, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr71, 7, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr72, 8, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr73, 9, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr74, 10, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr75, 11, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr76, 12, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr77, 13, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr78, 14, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr79, 15, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 16, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 17, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 18, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 19, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 20, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 21, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 22, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 23, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr88, 24, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr89, 25, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr90, 26, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed 
$sgpr91, 27, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr92, 28, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr93, 29, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr0 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr66, 2, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr67, 3, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr68, 4, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr69, 5, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr70, 6, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr71, 7, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr72, 8, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr73, 9, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr74, 10, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr75, 11, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr76, 12, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr77, 13, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr78, 14, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr79, 15, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 16, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 17, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 18, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 19, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 20, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 21, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 22, $vgpr63 + ; GCN-NEXT: $vgpr63 = 
SI_SPILL_S32_TO_VGPR killed $sgpr87, 23, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr88, 24, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr89, 25, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr90, 26, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr91, 27, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr92, 28, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr93, 29, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr63 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr64, 1, [[DEF]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll index 00564a7db77bc..d2b960fe43f84 100644 --- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll @@ -22,11 +22,11 @@ define void @spill_more_than_wavesize_csr_sgprs() { } ; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object: -; CHECK-DAG: v_writelane_b32 v0, s98, 63 -; CHECK-DAG: v_writelane_b32 v1, s99, 0 +; CHECK-DAG: v_writelane_b32 v1, s98, 63 +; CHECK-DAG: v_writelane_b32 v2, s99, 0 ; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v1, 0 -; CHECK-DAG: 
v_readlane_b32 s98, v0, 63 +; CHECK-DAG: v_readlane_b32 s99, v2, 0 +; CHECK-DAG: v_readlane_b32 s98, v1, 63 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index c352229f6a494..d8db2d5319868 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -1585,17 +1585,17 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s16, -1 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s16 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0x1200 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s30, 0 -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s31, 1 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane +; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s30, 0 +; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s31, 1 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, s32 -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s16, 0 +; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s16, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s16, s16, 5 -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s16, 1 +; WAVE32-WWM-PREALLOC-NEXT: 
v_writelane_b32 v32, s16, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1673,18 +1673,18 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s5, v33, 1 -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v33, 0 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s5, v32, 1 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v32, 0 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMSTART ; WAVE32-WWM-PREALLOC-NEXT: ; use s5 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4 -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v32, 1 -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v32, 0 -; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr33 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0 +; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 -; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index 57e4cec4eccb1..468a8463a06d6 100644 --- 
a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -252,31 +252,31 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_mov_b32 s16, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[18:19] -; GFX7-NEXT: v_writelane_b32 v40, s16, 2 -; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: v_writelane_b32 v42, s16, 2 +; GFX7-NEXT: v_writelane_b32 v42, s30, 0 ; GFX7-NEXT: s_mov_b32 s17, f16_result@abs32@hi ; GFX7-NEXT: s_mov_b32 s16, f16_result@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: v_mov_b32_e32 v42, v1 -; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v42, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v41, v1 +; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 -; GFX7-NEXT: v_readlane_b32 s30, v40, 0 -; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 +; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: flat_store_short v[41:42], v0 -; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded 
Reload +; GFX7-NEXT: flat_store_short v[40:41], v0 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 @@ -294,37 +294,37 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_mov_b32 s16, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[18:19] -; GFX7-NEXT: v_writelane_b32 v40, s16, 2 -; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: v_writelane_b32 v42, s16, 2 +; GFX7-NEXT: v_writelane_b32 v42, s30, 0 ; GFX7-NEXT: s_mov_b32 s17, v2f16_result@abs32@hi ; GFX7-NEXT: s_mov_b32 s16, v2f16_result@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: v_mov_b32_e32 v42, v1 -; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v42, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v41, v1 +; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 -; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: 
v_readlane_b32 s31, v42, 1 +; GFX7-NEXT: v_readlane_b32 s30, v42, 0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: flat_store_dword v[41:42], v0 -; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: flat_store_dword v[40:41], v0 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 @@ -342,18 +342,18 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_mov_b32 s16, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[18:19] -; GFX7-NEXT: v_writelane_b32 v40, s16, 2 -; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: v_writelane_b32 v42, s16, 2 +; GFX7-NEXT: v_writelane_b32 v42, s30, 0 ; GFX7-NEXT: s_mov_b32 s17, v4f16_result@abs32@hi ; GFX7-NEXT: s_mov_b32 s16, v4f16_result@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: 
buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: v_mov_b32_e32 v42, v1 -; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v42, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v41, v1 +; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -375,17 +375,17 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_or_b32_e32 v4, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v41 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v2 -; GFX7-NEXT: flat_store_dword v[41:42], v4 -; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 -; GFX7-NEXT: v_readlane_b32 s30, v40, 0 -; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: flat_store_dword v[40:41], v4 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 +; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 @@ -403,18 +403,18 @@ define 
void @outgoing_v8f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_mov_b32 s16, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[18:19] -; GFX7-NEXT: v_writelane_b32 v40, s16, 2 -; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: v_writelane_b32 v42, s16, 2 +; GFX7-NEXT: v_writelane_b32 v42, s30, 0 ; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi ; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: v_mov_b32_e32 v42, v1 -; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v42, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v41, v1 +; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -456,23 +456,23 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v41 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v40 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v41 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v40 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v5 -; 
GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v41 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v2 -; GFX7-NEXT: flat_store_dword v[41:42], v8 -; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 -; GFX7-NEXT: v_readlane_b32 s30, v40, 0 -; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: flat_store_dword v[40:41], v8 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 +; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index dda41ec131a31..ebbce68221a94 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -9,27 +9,27 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 16 +; GCN-NEXT: v_writelane_b32 v41, s16, 16 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v41, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s31, 1 +; GCN-NEXT: v_writelane_b32 v41, s34, 2 +; GCN-NEXT: v_writelane_b32 v41, s35, 3 +; GCN-NEXT: v_writelane_b32 v41, s36, 4 +; GCN-NEXT: v_writelane_b32 v41, s37, 5 +; GCN-NEXT: v_writelane_b32 v41, s38, 6 +; GCN-NEXT: v_writelane_b32 v41, s39, 7 +; GCN-NEXT: v_writelane_b32 v41, s40, 8 +; GCN-NEXT: v_writelane_b32 v41, s41, 9 +; GCN-NEXT: v_writelane_b32 v41, s42, 10 +; GCN-NEXT: v_writelane_b32 v41, s43, 11 +; GCN-NEXT: v_writelane_b32 v41, s44, 12 +; GCN-NEXT: v_writelane_b32 v41, s45, 13 +; GCN-NEXT: v_writelane_b32 v41, s46, 14 +; GCN-NEXT: v_writelane_b32 v41, s47, 15 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_load_dword v0, v[0:1] @@ -58,9 +58,9 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b32 s43, s13 ; GCN-NEXT: s_mov_b32 s44, s14 ; GCN-NEXT: s_mov_b32 s45, s15 -; GCN-NEXT: v_mov_b32_e32 v41, v31 +; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mov_b32_e32 v31, v41 +; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_mov_b32 s12, s42 ; GCN-NEXT: s_mov_b32 s13, s43 ; GCN-NEXT: s_mov_b32 s14, s44 @@ -93,26 
+93,26 @@ define hidden void @widget() { ; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s4, v40, 16 +; GCN-NEXT: v_readlane_b32 s47, v41, 15 +; GCN-NEXT: v_readlane_b32 s46, v41, 14 +; GCN-NEXT: v_readlane_b32 s45, v41, 13 +; GCN-NEXT: v_readlane_b32 s44, v41, 12 +; GCN-NEXT: v_readlane_b32 s43, v41, 11 +; GCN-NEXT: v_readlane_b32 s42, v41, 10 +; GCN-NEXT: v_readlane_b32 s41, v41, 9 +; GCN-NEXT: v_readlane_b32 s40, v41, 8 +; GCN-NEXT: v_readlane_b32 s39, v41, 7 +; GCN-NEXT: v_readlane_b32 s38, v41, 6 +; GCN-NEXT: v_readlane_b32 s37, v41, 5 +; GCN-NEXT: v_readlane_b32 s36, v41, 4 +; GCN-NEXT: v_readlane_b32 s35, v41, 3 +; GCN-NEXT: v_readlane_b32 s34, v41, 2 +; GCN-NEXT: v_readlane_b32 s31, v41, 1 +; GCN-NEXT: v_readlane_b32 s30, v41, 0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v41, 16 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: 
s_mov_b32 s33, s4 @@ -257,45 +257,45 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 28 +; GCN-NEXT: v_writelane_b32 v45, s16, 28 ; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: v_writelane_b32 v40, s52, 20 -; GCN-NEXT: v_writelane_b32 v40, s53, 21 -; GCN-NEXT: v_writelane_b32 v40, s54, 22 -; GCN-NEXT: v_writelane_b32 v40, s55, 23 -; GCN-NEXT: v_writelane_b32 v40, s56, 24 -; GCN-NEXT: v_writelane_b32 v40, s57, 
25 -; GCN-NEXT: v_writelane_b32 v40, s58, 26 -; GCN-NEXT: v_writelane_b32 v40, s59, 27 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v45, s30, 0 +; GCN-NEXT: v_writelane_b32 v45, s31, 1 +; GCN-NEXT: v_writelane_b32 v45, s34, 2 +; GCN-NEXT: v_writelane_b32 v45, s35, 3 +; GCN-NEXT: v_writelane_b32 v45, s36, 4 +; GCN-NEXT: v_writelane_b32 v45, s37, 5 +; GCN-NEXT: v_writelane_b32 v45, s38, 6 +; GCN-NEXT: v_writelane_b32 v45, s39, 7 +; GCN-NEXT: v_writelane_b32 v45, s40, 8 +; GCN-NEXT: v_writelane_b32 v45, s41, 9 +; GCN-NEXT: v_writelane_b32 v45, s42, 10 +; GCN-NEXT: v_writelane_b32 v45, s43, 11 +; GCN-NEXT: v_writelane_b32 v45, s44, 12 +; GCN-NEXT: v_writelane_b32 v45, s45, 13 +; GCN-NEXT: v_writelane_b32 v45, s46, 14 +; GCN-NEXT: v_writelane_b32 v45, s47, 15 +; GCN-NEXT: v_writelane_b32 v45, s48, 16 +; GCN-NEXT: v_writelane_b32 v45, s49, 17 +; GCN-NEXT: v_writelane_b32 v45, s50, 18 +; GCN-NEXT: v_writelane_b32 v45, s51, 19 +; GCN-NEXT: v_writelane_b32 v45, s52, 20 +; GCN-NEXT: v_writelane_b32 v45, s53, 21 +; GCN-NEXT: v_writelane_b32 v45, s54, 22 +; GCN-NEXT: v_writelane_b32 v45, s55, 23 +; GCN-NEXT: v_writelane_b32 v45, s56, 24 +; GCN-NEXT: v_writelane_b32 v45, s57, 25 +; GCN-NEXT: v_writelane_b32 v45, s58, 26 +; GCN-NEXT: v_writelane_b32 v45, s59, 27 ; GCN-NEXT: s_mov_b64 s[34:35], s[6:7] -; GCN-NEXT: v_mov_b32_e32 v41, v31 +; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_mov_b32 s46, s15 ; GCN-NEXT: s_mov_b32 s47, s14 ; GCN-NEXT: s_mov_b32 s48, s13 @@ -305,18 +305,18 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; 
GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41 -; GCN-NEXT: v_mov_b32_e32 v43, 0 -; GCN-NEXT: flat_load_dword v44, v[0:1] +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: flat_load_dword v43, v[0:1] ; GCN-NEXT: s_mov_b64 s[50:51], 0 ; GCN-NEXT: s_getpc_b64 s[52:53] ; GCN-NEXT: s_add_u32 s52, s52, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s53, s53, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[54:55], 0, v44 -; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v44 -; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000 +; GCN-NEXT: v_cmp_eq_f32_e64 s[54:55], 0, v43 +; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v43 +; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_1: ; %Flow7 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -327,8 +327,8 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_18 ; GCN-NEXT: .LBB1_2: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: flat_load_dword v0, v[42:43] -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 +; GCN-NEXT: flat_load_dword v0, v[41:42] +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0 @@ -351,7 +351,7 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b32 s13, s48 ; GCN-NEXT: s_mov_b32 s14, s47 ; GCN-NEXT: s_mov_b32 s15, s46 -; GCN-NEXT: v_mov_b32_e32 v31, v41 +; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[52:53] ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 @@ -364,7 +364,7 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: 
s_or_b64 s[8:9], s[54:55], exec ; GCN-NEXT: .LBB1_7: ; %Flow3 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -376,7 +376,7 @@ define hidden void @blam() { ; GCN-NEXT: ; %bb.8: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 ; GCN-NEXT: .LBB1_9: ; %Flow4 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] @@ -406,7 +406,7 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_15 ; GCN-NEXT: ; %bb.14: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_or_b64 s[10:11], s[6:7], exec ; GCN-NEXT: .LBB1_15: ; %Flow6 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -424,47 +424,47 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_1 ; GCN-NEXT: ; %bb.17: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock ; GCN-NEXT: s_or_b64 exec, exec, s[50:51] -; GCN-NEXT: v_readlane_b32 s59, v40, 27 -; GCN-NEXT: v_readlane_b32 s58, v40, 26 -; GCN-NEXT: v_readlane_b32 s57, v40, 25 -; GCN-NEXT: v_readlane_b32 s56, v40, 24 -; GCN-NEXT: v_readlane_b32 s55, v40, 23 -; GCN-NEXT: v_readlane_b32 s54, v40, 22 -; GCN-NEXT: v_readlane_b32 s53, v40, 21 -; GCN-NEXT: v_readlane_b32 s52, v40, 20 -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: 
v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s4, v40, 28 +; GCN-NEXT: v_readlane_b32 s59, v45, 27 +; GCN-NEXT: v_readlane_b32 s58, v45, 26 +; GCN-NEXT: v_readlane_b32 s57, v45, 25 +; GCN-NEXT: v_readlane_b32 s56, v45, 24 +; GCN-NEXT: v_readlane_b32 s55, v45, 23 +; GCN-NEXT: v_readlane_b32 s54, v45, 22 +; GCN-NEXT: v_readlane_b32 s53, v45, 21 +; GCN-NEXT: v_readlane_b32 s52, v45, 20 +; GCN-NEXT: v_readlane_b32 s51, v45, 19 +; GCN-NEXT: v_readlane_b32 s50, v45, 18 +; GCN-NEXT: v_readlane_b32 s49, v45, 17 +; GCN-NEXT: v_readlane_b32 s48, v45, 16 +; GCN-NEXT: v_readlane_b32 s47, v45, 15 +; GCN-NEXT: v_readlane_b32 s46, v45, 14 +; GCN-NEXT: v_readlane_b32 s45, v45, 13 +; GCN-NEXT: v_readlane_b32 s44, v45, 12 +; GCN-NEXT: v_readlane_b32 s43, v45, 11 +; GCN-NEXT: v_readlane_b32 s42, v45, 10 +; GCN-NEXT: v_readlane_b32 s41, v45, 9 +; GCN-NEXT: v_readlane_b32 s40, v45, 8 +; GCN-NEXT: v_readlane_b32 s39, v45, 7 +; GCN-NEXT: v_readlane_b32 s38, v45, 6 +; GCN-NEXT: v_readlane_b32 s37, v45, 5 +; GCN-NEXT: v_readlane_b32 s36, v45, 4 +; GCN-NEXT: v_readlane_b32 s35, v45, 3 +; GCN-NEXT: v_readlane_b32 s34, v45, 2 +; GCN-NEXT: v_readlane_b32 s31, v45, 1 +; GCN-NEXT: 
v_readlane_b32 s30, v45, 0 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v45, 28 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xf800 ; GCN-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll index f3276719ac13c..5ced02f28c977 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll @@ -1,188 +1,214 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX900 %s ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s -; RUN: not --crash llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX908-ERR %s +; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90a %s -; This test crashes for gfx908 while allocating the tuple. Compared to the other subtargets, +; This test used to crash for gfx908 while allocating the tuple. 
Compared to the other subtargets, ; gfx908 marks an extra VGPR reserved for AGPR to VGPR copy that puts more register pressure. - -; GFX908-ERR: error: ran out of registers during register allocation +; To minimize the register pressure, the VGPRs used for CSR SGPR spilling have been picked from the +; higher available range, thereby leaving more VGPRs available in the lowest range for allocation. define i32 @test_tuple(<16 x i64> %0) { ; GFX900-LABEL: test_tuple: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX900-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte
Folded Spill -; GFX900-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX900-NEXT: v_writelane_b32 v31, s36, 0 -; GFX900-NEXT: v_writelane_b32 v31, s37, 1 -; GFX900-NEXT: v_writelane_b32 v31, s38, 2 -; GFX900-NEXT: v_writelane_b32 v31, s39, 3 -; GFX900-NEXT: v_writelane_b32 v31, s40, 4 -; GFX900-NEXT: v_writelane_b32 v31, s41, 5 -; GFX900-NEXT: v_writelane_b32 v31, s42, 6 -; GFX900-NEXT: v_writelane_b32 v31, s43, 7 -; GFX900-NEXT: v_writelane_b32 v31, s44, 8 -; GFX900-NEXT: v_writelane_b32 v31, s45, 9 -; GFX900-NEXT: v_writelane_b32 v31, s46, 10 -; GFX900-NEXT: v_writelane_b32 v31, s47, 11 -; GFX900-NEXT: v_writelane_b32 v31, s48, 12 -; GFX900-NEXT: v_writelane_b32 v31, s49, 13 -; GFX900-NEXT: v_writelane_b32 v31, s50, 14 -; GFX900-NEXT: v_writelane_b32 v31, s51, 15 -; GFX900-NEXT: v_writelane_b32 v31, s52, 16 -; GFX900-NEXT: v_writelane_b32 v31, s53, 17 -; GFX900-NEXT: v_writelane_b32 v31, s54, 18 -; GFX900-NEXT: v_writelane_b32 v31, s55, 19 -; GFX900-NEXT: v_writelane_b32 v31, s56, 20 -; GFX900-NEXT: v_writelane_b32 v31, s57, 21 -; GFX900-NEXT: v_writelane_b32 v31, s58, 22 -; GFX900-NEXT: v_writelane_b32 v31, s59, 23 -; GFX900-NEXT: v_writelane_b32 v31, s60, 24 -; GFX900-NEXT: v_writelane_b32 v31, s61, 25 -; GFX900-NEXT: v_writelane_b32 v31, s62, 26 -; GFX900-NEXT: v_writelane_b32 v31, s63, 27 -; GFX900-NEXT: v_writelane_b32 v31, s64, 28 -; GFX900-NEXT: v_writelane_b32 v31, s65, 29 -; GFX900-NEXT: v_writelane_b32 v31, s66, 30 -; GFX900-NEXT: v_writelane_b32 v31, s67, 31 -; GFX900-NEXT: v_mov_b32_e32 v32, v0 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; 
GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec -; GFX900-NEXT: v_mov_b32_e32 v33, v1 -; GFX900-NEXT: v_mov_b32_e32 v34, v2 -; GFX900-NEXT: v_mov_b32_e32 v35, v3 -; GFX900-NEXT: v_mov_b32_e32 v36, v4 -; GFX900-NEXT: v_mov_b32_e32 v37, v5 -; GFX900-NEXT: v_mov_b32_e32 v38, v6 -; GFX900-NEXT: v_mov_b32_e32 v39, v7 -; GFX900-NEXT: v_mov_b32_e32 v40, v8 -; GFX900-NEXT: v_mov_b32_e32 v41, v9 -; GFX900-NEXT: v_mov_b32_e32 v42, v10 -; GFX900-NEXT: v_mov_b32_e32 v43, v11 -; GFX900-NEXT: v_mov_b32_e32 v44, v12 -; GFX900-NEXT: v_mov_b32_e32 v45, v13 -; GFX900-NEXT: v_mov_b32_e32 v46, v14 -; GFX900-NEXT: v_mov_b32_e32 v47, v15 -; GFX900-NEXT: v_mov_b32_e32 v48, v16 -; GFX900-NEXT: v_mov_b32_e32 v49, v17 -; 
GFX900-NEXT: v_mov_b32_e32 v50, v18 -; GFX900-NEXT: v_mov_b32_e32 v51, v19 -; GFX900-NEXT: v_mov_b32_e32 v52, v20 -; GFX900-NEXT: v_mov_b32_e32 v53, v21 -; GFX900-NEXT: v_mov_b32_e32 v54, v22 -; GFX900-NEXT: v_mov_b32_e32 v55, v23 -; GFX900-NEXT: v_mov_b32_e32 v56, v24 -; GFX900-NEXT: v_mov_b32_e32 v57, v25 -; GFX900-NEXT: v_mov_b32_e32 v58, v26 -; GFX900-NEXT: v_mov_b32_e32 v59, v27 -; GFX900-NEXT: v_mov_b32_e32 v60, v28 -; GFX900-NEXT: v_mov_b32_e32 v61, v29 -; GFX900-NEXT: v_mov_b32_e32 v62, v30 -; GFX900-NEXT: ; kill: def $vgpr63 killed $vgpr0 killed $exec +; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: v_writelane_b32 v63, s36, 0 +; GFX900-NEXT: v_writelane_b32 v63, s37, 1 +; 
GFX900-NEXT: v_writelane_b32 v63, s38, 2 +; GFX900-NEXT: v_writelane_b32 v63, s39, 3 +; GFX900-NEXT: v_writelane_b32 v63, s40, 4 +; GFX900-NEXT: v_writelane_b32 v63, s41, 5 +; GFX900-NEXT: v_writelane_b32 v63, s42, 6 +; GFX900-NEXT: v_writelane_b32 v63, s43, 7 +; GFX900-NEXT: v_writelane_b32 v63, s44, 8 +; GFX900-NEXT: v_writelane_b32 v63, s45, 9 +; GFX900-NEXT: v_writelane_b32 v63, s46, 10 +; GFX900-NEXT: v_writelane_b32 v63, s47, 11 +; GFX900-NEXT: v_writelane_b32 v63, s48, 12 +; GFX900-NEXT: v_writelane_b32 v63, s49, 13 +; GFX900-NEXT: v_writelane_b32 v63, s50, 14 +; GFX900-NEXT: v_writelane_b32 v63, s51, 15 +; GFX900-NEXT: v_writelane_b32 v63, s52, 16 +; GFX900-NEXT: v_writelane_b32 v63, s53, 17 +; GFX900-NEXT: v_writelane_b32 v63, s54, 18 +; GFX900-NEXT: v_writelane_b32 v63, s55, 19 +; GFX900-NEXT: v_writelane_b32 v63, s56, 20 +; GFX900-NEXT: v_writelane_b32 v63, s57, 21 +; GFX900-NEXT: v_writelane_b32 v63, s58, 22 +; GFX900-NEXT: v_writelane_b32 v63, s59, 23 +; GFX900-NEXT: v_writelane_b32 v63, s60, 24 +; GFX900-NEXT: v_writelane_b32 v63, s61, 25 +; GFX900-NEXT: v_writelane_b32 v63, s62, 26 +; GFX900-NEXT: v_writelane_b32 v63, s63, 27 +; GFX900-NEXT: v_writelane_b32 v63, s64, 28 +; GFX900-NEXT: v_writelane_b32 v63, s65, 29 +; GFX900-NEXT: v_writelane_b32 v63, s66, 30 +; GFX900-NEXT: v_writelane_b32 v63, s67, 31 +; GFX900-NEXT: v_mov_b32_e32 v33, v30 +; GFX900-NEXT: v_mov_b32_e32 v34, v29 +; GFX900-NEXT: v_mov_b32_e32 v35, v28 +; GFX900-NEXT: v_mov_b32_e32 v36, v27 +; GFX900-NEXT: v_mov_b32_e32 v37, v26 +; GFX900-NEXT: v_mov_b32_e32 v38, v25 +; GFX900-NEXT: v_mov_b32_e32 v39, v24 +; GFX900-NEXT: v_mov_b32_e32 v48, v23 +; GFX900-NEXT: v_mov_b32_e32 v49, v22 +; GFX900-NEXT: v_mov_b32_e32 v50, v21 +; GFX900-NEXT: v_mov_b32_e32 v51, v20 +; GFX900-NEXT: v_mov_b32_e32 v52, v19 +; GFX900-NEXT: v_mov_b32_e32 v53, v18 +; GFX900-NEXT: v_mov_b32_e32 v54, v17 +; GFX900-NEXT: v_mov_b32_e32 v55, v16 +; GFX900-NEXT: v_mov_b32_e32 v40, v15 +; GFX900-NEXT: v_mov_b32_e32 v41, 
v14 +; GFX900-NEXT: v_mov_b32_e32 v42, v13 +; GFX900-NEXT: v_mov_b32_e32 v43, v12 +; GFX900-NEXT: v_mov_b32_e32 v44, v11 +; GFX900-NEXT: v_mov_b32_e32 v45, v10 +; GFX900-NEXT: v_mov_b32_e32 v46, v9 +; GFX900-NEXT: v_mov_b32_e32 v47, v8 +; GFX900-NEXT: v_mov_b32_e32 v56, v7 +; GFX900-NEXT: v_mov_b32_e32 v57, v6 +; GFX900-NEXT: v_mov_b32_e32 v58, v5 +; GFX900-NEXT: v_mov_b32_e32 v59, v4 +; GFX900-NEXT: v_mov_b32_e32 v60, v3 +; GFX900-NEXT: v_mov_b32_e32 v61, v2 +; GFX900-NEXT: v_mov_b32_e32 v62, v1 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GFX900-NEXT: v_mov_b32_e32 v1, v62 +; GFX900-NEXT: v_mov_b32_e32 v2, v61 +; GFX900-NEXT: v_mov_b32_e32 v3, v60 +; GFX900-NEXT: v_mov_b32_e32 v4, v59 +; GFX900-NEXT: v_mov_b32_e32 v5, v58 +; GFX900-NEXT: v_mov_b32_e32 v6, v57 +; GFX900-NEXT: v_mov_b32_e32 v7, v56 +; GFX900-NEXT: v_mov_b32_e32 v8, v47 +; GFX900-NEXT: v_mov_b32_e32 v9, v46 +; GFX900-NEXT: v_mov_b32_e32 v10, v45 +; GFX900-NEXT: v_mov_b32_e32 v11, v44 +; GFX900-NEXT: v_mov_b32_e32 v12, v43 +; GFX900-NEXT: v_mov_b32_e32 v13, v42 +; GFX900-NEXT: v_mov_b32_e32 v14, v41 +; GFX900-NEXT: v_mov_b32_e32 v15, v40 +; GFX900-NEXT: v_mov_b32_e32 v16, v55 +; GFX900-NEXT: v_mov_b32_e32 v17, v54 +; GFX900-NEXT: v_mov_b32_e32 v18, v53 +; GFX900-NEXT: v_mov_b32_e32 v19, v52 +; GFX900-NEXT: v_mov_b32_e32 v20, v51 +; GFX900-NEXT: v_mov_b32_e32 v21, v50 +; GFX900-NEXT: v_mov_b32_e32 v22, v49 +; GFX900-NEXT: v_mov_b32_e32 v23, v48 +; GFX900-NEXT: v_mov_b32_e32 v24, v39 +; GFX900-NEXT: v_mov_b32_e32 v25, v38 +; GFX900-NEXT: v_mov_b32_e32 v26, v37 +; GFX900-NEXT: v_mov_b32_e32 v27, v36 +; GFX900-NEXT: v_mov_b32_e32 v28, v35 +; GFX900-NEXT: v_mov_b32_e32 v29, v34 +; GFX900-NEXT: v_mov_b32_e32 v30, v33 +; GFX900-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX900-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: v_readlane_b32 s67, v31, 31 -; GFX900-NEXT: v_readlane_b32 s66, v31, 30 -; GFX900-NEXT: v_readlane_b32 s65, v31, 29 -; GFX900-NEXT: v_readlane_b32 s64, v31, 28 -; GFX900-NEXT: v_readlane_b32 s63, v31, 27 
-; GFX900-NEXT: v_readlane_b32 s62, v31, 26 -; GFX900-NEXT: v_readlane_b32 s61, v31, 25 -; GFX900-NEXT: v_readlane_b32 s60, v31, 24 -; GFX900-NEXT: v_readlane_b32 s59, v31, 23 -; GFX900-NEXT: v_readlane_b32 s58, v31, 22 -; GFX900-NEXT: v_readlane_b32 s57, v31, 21 -; GFX900-NEXT: v_readlane_b32 s56, v31, 20 -; GFX900-NEXT: v_readlane_b32 s55, v31, 19 -; GFX900-NEXT: v_readlane_b32 s54, v31, 18 -; GFX900-NEXT: v_readlane_b32 s53, v31, 17 -; GFX900-NEXT: v_readlane_b32 s52, v31, 16 -; GFX900-NEXT: v_readlane_b32 s51, v31, 15 -; GFX900-NEXT: v_readlane_b32 s50, v31, 14 -; GFX900-NEXT: v_readlane_b32 s49, v31, 13 -; GFX900-NEXT: v_readlane_b32 s48, v31, 12 -; GFX900-NEXT: v_readlane_b32 s47, v31, 11 -; GFX900-NEXT: v_readlane_b32 s46, v31, 10 -; GFX900-NEXT: v_readlane_b32 s45, v31, 9 -; GFX900-NEXT: v_readlane_b32 s44, v31, 8 -; GFX900-NEXT: v_readlane_b32 s43, v31, 7 -; GFX900-NEXT: v_readlane_b32 s42, v31, 6 -; GFX900-NEXT: v_readlane_b32 s41, v31, 5 -; GFX900-NEXT: v_readlane_b32 s40, v31, 4 -; GFX900-NEXT: v_readlane_b32 s39, v31, 3 -; GFX900-NEXT: v_readlane_b32 s38, v31, 2 -; GFX900-NEXT: v_readlane_b32 s37, v31, 1 -; GFX900-NEXT: v_readlane_b32 s36, v31, 0 -; GFX900-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX900-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX900-NEXT: v_readlane_b32 s67, v63, 31 +; GFX900-NEXT: v_readlane_b32 s66, v63, 30 +; GFX900-NEXT: v_readlane_b32 s65, v63, 29 +; GFX900-NEXT: v_readlane_b32 s64, v63, 28 +; GFX900-NEXT: v_readlane_b32 s63, v63, 27 +; GFX900-NEXT: v_readlane_b32 s62, v63, 26 +; GFX900-NEXT: v_readlane_b32 s61, v63, 25 +; GFX900-NEXT: v_readlane_b32 s60, v63, 24 +; GFX900-NEXT: v_readlane_b32 s59, v63, 23 +; GFX900-NEXT: v_readlane_b32 s58, v63, 22 +; GFX900-NEXT: v_readlane_b32 s57, v63, 21 +; GFX900-NEXT: v_readlane_b32 s56, v63, 20 +; GFX900-NEXT: v_readlane_b32 s55, v63, 19 +; GFX900-NEXT: v_readlane_b32 s54, v63, 18 +; GFX900-NEXT: v_readlane_b32 s53, v63, 17 +; GFX900-NEXT: v_readlane_b32 s52, v63, 16 +; GFX900-NEXT: v_readlane_b32 s51, v63, 15 +; GFX900-NEXT: v_readlane_b32 s50, v63, 14 +; GFX900-NEXT: v_readlane_b32 s49, v63, 13 +; GFX900-NEXT: v_readlane_b32 s48, v63, 12 +; GFX900-NEXT: v_readlane_b32 s47, v63, 11 +; GFX900-NEXT: v_readlane_b32 s46, v63, 10 +; GFX900-NEXT: v_readlane_b32 s45, v63, 9 +; GFX900-NEXT: v_readlane_b32 s44, v63, 8 +; GFX900-NEXT: v_readlane_b32 s43, v63, 7 +; GFX900-NEXT: v_readlane_b32 s42, v63, 6 +; GFX900-NEXT: v_readlane_b32 s41, v63, 5 +; GFX900-NEXT: v_readlane_b32 s40, v63, 4 +; GFX900-NEXT: v_readlane_b32 
s39, v63, 3 +; GFX900-NEXT: v_readlane_b32 s38, v63, 2 +; GFX900-NEXT: v_readlane_b32 s37, v63, 1 +; GFX900-NEXT: v_readlane_b32 s36, v63, 0 +; GFX900-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX900-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -190,185 +216,416 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX906-LABEL: test_tuple: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 
4-byte Folded Spill +; GFX906-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX906-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[4:5] -; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: v_writelane_b32 v31, s36, 0 -; GFX906-NEXT: v_writelane_b32 v31, s37, 1 -; GFX906-NEXT: v_writelane_b32 v31, s38, 2 -; GFX906-NEXT: v_writelane_b32 v31, s39, 3 -; GFX906-NEXT: v_writelane_b32 v31, s40, 4 -; GFX906-NEXT: v_writelane_b32 v31, s41, 5 -; GFX906-NEXT: v_writelane_b32 v31, s42, 6 -; GFX906-NEXT: v_writelane_b32 v31, s43, 7 -; GFX906-NEXT: v_writelane_b32 
v31, s44, 8 -; GFX906-NEXT: v_writelane_b32 v31, s45, 9 -; GFX906-NEXT: v_writelane_b32 v31, s46, 10 -; GFX906-NEXT: v_writelane_b32 v31, s47, 11 -; GFX906-NEXT: v_writelane_b32 v31, s48, 12 -; GFX906-NEXT: v_writelane_b32 v31, s49, 13 -; GFX906-NEXT: v_writelane_b32 v31, s50, 14 -; GFX906-NEXT: v_writelane_b32 v31, s51, 15 -; GFX906-NEXT: v_writelane_b32 v31, s52, 16 -; GFX906-NEXT: v_writelane_b32 v31, s53, 17 -; GFX906-NEXT: v_writelane_b32 v31, s54, 18 -; GFX906-NEXT: v_writelane_b32 v31, s55, 19 -; GFX906-NEXT: v_writelane_b32 v31, s56, 20 -; GFX906-NEXT: v_writelane_b32 v31, s57, 21 -; GFX906-NEXT: v_writelane_b32 v31, s58, 22 -; GFX906-NEXT: v_writelane_b32 v31, s59, 23 -; GFX906-NEXT: v_writelane_b32 v31, s60, 24 -; GFX906-NEXT: v_writelane_b32 v31, s61, 25 -; GFX906-NEXT: v_writelane_b32 v31, s62, 26 -; GFX906-NEXT: v_writelane_b32 v31, s63, 27 -; GFX906-NEXT: v_writelane_b32 v31, s64, 28 -; GFX906-NEXT: v_writelane_b32 v31, s65, 29 -; GFX906-NEXT: v_writelane_b32 v31, s66, 30 -; GFX906-NEXT: v_writelane_b32 v31, s67, 31 -; GFX906-NEXT: v_mov_b32_e32 v32, v0 -; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; 
GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec -; GFX906-NEXT: v_mov_b32_e32 v33, v1 -; GFX906-NEXT: v_mov_b32_e32 v34, v2 -; GFX906-NEXT: v_mov_b32_e32 v35, v3 -; GFX906-NEXT: v_mov_b32_e32 v36, v4 -; GFX906-NEXT: v_mov_b32_e32 v37, v5 -; GFX906-NEXT: v_mov_b32_e32 v38, v6 -; GFX906-NEXT: v_mov_b32_e32 v39, v7 -; GFX906-NEXT: v_mov_b32_e32 v40, v8 -; GFX906-NEXT: v_mov_b32_e32 v41, v9 -; GFX906-NEXT: v_mov_b32_e32 v42, v10 -; GFX906-NEXT: v_mov_b32_e32 v43, v11 -; GFX906-NEXT: v_mov_b32_e32 v44, v12 -; GFX906-NEXT: v_mov_b32_e32 v45, v13 -; GFX906-NEXT: v_mov_b32_e32 v46, v14 -; GFX906-NEXT: v_mov_b32_e32 v47, v15 -; GFX906-NEXT: v_mov_b32_e32 v48, v16 -; GFX906-NEXT: v_mov_b32_e32 v49, v17 -; GFX906-NEXT: v_mov_b32_e32 v50, v18 -; GFX906-NEXT: v_mov_b32_e32 v51, v19 -; GFX906-NEXT: v_mov_b32_e32 v52, v20 -; GFX906-NEXT: v_mov_b32_e32 v53, v21 -; GFX906-NEXT: v_mov_b32_e32 v54, v22 -; GFX906-NEXT: v_mov_b32_e32 v55, v23 -; GFX906-NEXT: v_mov_b32_e32 v56, v24 -; GFX906-NEXT: v_mov_b32_e32 v57, v25 -; GFX906-NEXT: v_mov_b32_e32 v58, v26 -; GFX906-NEXT: v_mov_b32_e32 v59, v27 -; GFX906-NEXT: v_mov_b32_e32 v60, v28 -; GFX906-NEXT: v_mov_b32_e32 v61, v29 -; GFX906-NEXT: v_mov_b32_e32 v62, v30 -; GFX906-NEXT: ; kill: def $vgpr63 killed $vgpr0 killed $exec +; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; 
GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: v_writelane_b32 v63, s36, 0 +; GFX906-NEXT: v_writelane_b32 v63, s37, 1 +; GFX906-NEXT: v_writelane_b32 v63, s38, 2 +; GFX906-NEXT: v_writelane_b32 v63, s39, 3 +; GFX906-NEXT: v_writelane_b32 v63, s40, 4 +; GFX906-NEXT: v_writelane_b32 v63, s41, 5 +; GFX906-NEXT: v_writelane_b32 v63, s42, 6 +; GFX906-NEXT: v_writelane_b32 v63, s43, 7 +; GFX906-NEXT: v_writelane_b32 v63, s44, 8 +; GFX906-NEXT: v_writelane_b32 v63, s45, 9 +; GFX906-NEXT: v_writelane_b32 v63, s46, 10 +; GFX906-NEXT: v_writelane_b32 v63, s47, 11 +; GFX906-NEXT: v_writelane_b32 v63, s48, 12 +; GFX906-NEXT: v_writelane_b32 v63, s49, 13 +; GFX906-NEXT: v_writelane_b32 v63, s50, 14 +; GFX906-NEXT: v_writelane_b32 v63, s51, 15 +; GFX906-NEXT: v_writelane_b32 v63, s52, 16 +; 
GFX906-NEXT: v_writelane_b32 v63, s53, 17 +; GFX906-NEXT: v_writelane_b32 v63, s54, 18 +; GFX906-NEXT: v_writelane_b32 v63, s55, 19 +; GFX906-NEXT: v_writelane_b32 v63, s56, 20 +; GFX906-NEXT: v_writelane_b32 v63, s57, 21 +; GFX906-NEXT: v_writelane_b32 v63, s58, 22 +; GFX906-NEXT: v_writelane_b32 v63, s59, 23 +; GFX906-NEXT: v_writelane_b32 v63, s60, 24 +; GFX906-NEXT: v_writelane_b32 v63, s61, 25 +; GFX906-NEXT: v_writelane_b32 v63, s62, 26 +; GFX906-NEXT: v_writelane_b32 v63, s63, 27 +; GFX906-NEXT: v_writelane_b32 v63, s64, 28 +; GFX906-NEXT: v_writelane_b32 v63, s65, 29 +; GFX906-NEXT: v_writelane_b32 v63, s66, 30 +; GFX906-NEXT: v_writelane_b32 v63, s67, 31 +; GFX906-NEXT: v_mov_b32_e32 v33, v30 +; GFX906-NEXT: v_mov_b32_e32 v34, v29 +; GFX906-NEXT: v_mov_b32_e32 v35, v28 +; GFX906-NEXT: v_mov_b32_e32 v36, v27 +; GFX906-NEXT: v_mov_b32_e32 v37, v26 +; GFX906-NEXT: v_mov_b32_e32 v38, v25 +; GFX906-NEXT: v_mov_b32_e32 v39, v24 +; GFX906-NEXT: v_mov_b32_e32 v48, v23 +; GFX906-NEXT: v_mov_b32_e32 v49, v22 +; GFX906-NEXT: v_mov_b32_e32 v50, v21 +; GFX906-NEXT: v_mov_b32_e32 v51, v20 +; GFX906-NEXT: v_mov_b32_e32 v52, v19 +; GFX906-NEXT: v_mov_b32_e32 v53, v18 +; GFX906-NEXT: v_mov_b32_e32 v54, v17 +; GFX906-NEXT: v_mov_b32_e32 v55, v16 +; GFX906-NEXT: v_mov_b32_e32 v40, v15 +; GFX906-NEXT: v_mov_b32_e32 v41, v14 +; GFX906-NEXT: v_mov_b32_e32 v42, v13 +; GFX906-NEXT: v_mov_b32_e32 v43, v12 +; GFX906-NEXT: v_mov_b32_e32 v44, v11 +; GFX906-NEXT: v_mov_b32_e32 v45, v10 +; GFX906-NEXT: v_mov_b32_e32 v46, v9 +; GFX906-NEXT: v_mov_b32_e32 v47, v8 +; GFX906-NEXT: v_mov_b32_e32 v56, v7 +; GFX906-NEXT: v_mov_b32_e32 v57, v6 +; GFX906-NEXT: v_mov_b32_e32 v58, v5 +; GFX906-NEXT: v_mov_b32_e32 v59, v4 +; GFX906-NEXT: v_mov_b32_e32 v60, v3 +; GFX906-NEXT: v_mov_b32_e32 v61, v2 +; GFX906-NEXT: v_mov_b32_e32 v62, v1 +; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: 
$sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GFX906-NEXT: v_mov_b32_e32 v1, v62 +; GFX906-NEXT: v_mov_b32_e32 v2, v61 +; GFX906-NEXT: v_mov_b32_e32 v3, v60 +; GFX906-NEXT: v_mov_b32_e32 v4, v59 +; GFX906-NEXT: v_mov_b32_e32 v5, v58 +; GFX906-NEXT: v_mov_b32_e32 v6, v57 +; GFX906-NEXT: v_mov_b32_e32 v7, v56 +; GFX906-NEXT: v_mov_b32_e32 v8, v47 +; GFX906-NEXT: v_mov_b32_e32 v9, v46 +; GFX906-NEXT: v_mov_b32_e32 v10, v45 +; GFX906-NEXT: v_mov_b32_e32 v11, v44 +; GFX906-NEXT: v_mov_b32_e32 v12, v43 +; GFX906-NEXT: v_mov_b32_e32 v13, v42 +; GFX906-NEXT: v_mov_b32_e32 v14, v41 +; GFX906-NEXT: v_mov_b32_e32 v15, v40 +; 
GFX906-NEXT: v_mov_b32_e32 v16, v55 +; GFX906-NEXT: v_mov_b32_e32 v17, v54 +; GFX906-NEXT: v_mov_b32_e32 v18, v53 +; GFX906-NEXT: v_mov_b32_e32 v19, v52 +; GFX906-NEXT: v_mov_b32_e32 v20, v51 +; GFX906-NEXT: v_mov_b32_e32 v21, v50 +; GFX906-NEXT: v_mov_b32_e32 v22, v49 +; GFX906-NEXT: v_mov_b32_e32 v23, v48 +; GFX906-NEXT: v_mov_b32_e32 v24, v39 +; GFX906-NEXT: v_mov_b32_e32 v25, v38 +; GFX906-NEXT: v_mov_b32_e32 v26, v37 +; GFX906-NEXT: v_mov_b32_e32 v27, v36 +; GFX906-NEXT: v_mov_b32_e32 v28, v35 +; GFX906-NEXT: v_mov_b32_e32 v29, v34 +; GFX906-NEXT: v_mov_b32_e32 v30, v33 +; GFX906-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX906-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: v_readlane_b32 s67, v31, 31 -; GFX906-NEXT: v_readlane_b32 s66, v31, 30 -; GFX906-NEXT: v_readlane_b32 s65, v31, 29 -; GFX906-NEXT: v_readlane_b32 s64, v31, 28 -; GFX906-NEXT: v_readlane_b32 s63, v31, 27 -; GFX906-NEXT: v_readlane_b32 s62, v31, 26 -; GFX906-NEXT: v_readlane_b32 s61, v31, 25 -; GFX906-NEXT: v_readlane_b32 s60, v31, 24 -; GFX906-NEXT: v_readlane_b32 s59, v31, 23 -; GFX906-NEXT: v_readlane_b32 s58, v31, 22 -; GFX906-NEXT: v_readlane_b32 s57, v31, 21 -; GFX906-NEXT: v_readlane_b32 s56, v31, 20 -; GFX906-NEXT: v_readlane_b32 s55, v31, 19 -; GFX906-NEXT: v_readlane_b32 s54, v31, 18 -; GFX906-NEXT: v_readlane_b32 s53, v31, 17 -; GFX906-NEXT: v_readlane_b32 s52, v31, 16 -; GFX906-NEXT: v_readlane_b32 s51, v31, 15 -; GFX906-NEXT: v_readlane_b32 s50, v31, 14 -; GFX906-NEXT: v_readlane_b32 s49, v31, 13 -; GFX906-NEXT: v_readlane_b32 s48, v31, 12 -; GFX906-NEXT: v_readlane_b32 s47, v31, 11 -; GFX906-NEXT: v_readlane_b32 s46, v31, 10 -; GFX906-NEXT: v_readlane_b32 s45, v31, 9 -; GFX906-NEXT: 
v_readlane_b32 s44, v31, 8 -; GFX906-NEXT: v_readlane_b32 s43, v31, 7 -; GFX906-NEXT: v_readlane_b32 s42, v31, 6 -; GFX906-NEXT: v_readlane_b32 s41, v31, 5 -; GFX906-NEXT: v_readlane_b32 s40, v31, 4 -; GFX906-NEXT: v_readlane_b32 s39, v31, 3 -; GFX906-NEXT: v_readlane_b32 s38, v31, 2 -; GFX906-NEXT: v_readlane_b32 s37, v31, 1 -; GFX906-NEXT: v_readlane_b32 s36, v31, 0 -; GFX906-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX906-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: v_readlane_b32 s67, v63, 31 +; GFX906-NEXT: 
v_readlane_b32 s66, v63, 30 +; GFX906-NEXT: v_readlane_b32 s65, v63, 29 +; GFX906-NEXT: v_readlane_b32 s64, v63, 28 +; GFX906-NEXT: v_readlane_b32 s63, v63, 27 +; GFX906-NEXT: v_readlane_b32 s62, v63, 26 +; GFX906-NEXT: v_readlane_b32 s61, v63, 25 +; GFX906-NEXT: v_readlane_b32 s60, v63, 24 +; GFX906-NEXT: v_readlane_b32 s59, v63, 23 +; GFX906-NEXT: v_readlane_b32 s58, v63, 22 +; GFX906-NEXT: v_readlane_b32 s57, v63, 21 +; GFX906-NEXT: v_readlane_b32 s56, v63, 20 +; GFX906-NEXT: v_readlane_b32 s55, v63, 19 +; GFX906-NEXT: v_readlane_b32 s54, v63, 18 +; GFX906-NEXT: v_readlane_b32 s53, v63, 17 +; GFX906-NEXT: v_readlane_b32 s52, v63, 16 +; GFX906-NEXT: v_readlane_b32 s51, v63, 15 +; GFX906-NEXT: v_readlane_b32 s50, v63, 14 +; GFX906-NEXT: v_readlane_b32 s49, v63, 13 +; GFX906-NEXT: v_readlane_b32 s48, v63, 12 +; GFX906-NEXT: v_readlane_b32 s47, v63, 11 +; GFX906-NEXT: v_readlane_b32 s46, v63, 10 +; GFX906-NEXT: v_readlane_b32 s45, v63, 9 +; GFX906-NEXT: v_readlane_b32 s44, v63, 8 +; GFX906-NEXT: v_readlane_b32 s43, v63, 7 +; GFX906-NEXT: v_readlane_b32 s42, v63, 6 +; GFX906-NEXT: v_readlane_b32 s41, v63, 5 +; GFX906-NEXT: v_readlane_b32 s40, v63, 4 +; GFX906-NEXT: v_readlane_b32 s39, v63, 3 +; GFX906-NEXT: v_readlane_b32 s38, v63, 2 +; GFX906-NEXT: v_readlane_b32 s37, v63, 1 +; GFX906-NEXT: v_readlane_b32 s36, v63, 0 +; GFX906-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v47, 
off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX906-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; +; GFX908-LABEL: test_tuple: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX908-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX908-NEXT: s_mov_b64 exec, s[4:5] +; GFX908-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 
a13, v61 ; Reload Reuse +; GFX908-NEXT: v_writelane_b32 v62, s36, 0 +; GFX908-NEXT: v_writelane_b32 v62, s37, 1 +; GFX908-NEXT: v_writelane_b32 v62, s38, 2 +; GFX908-NEXT: v_writelane_b32 v62, s39, 3 +; GFX908-NEXT: v_writelane_b32 v62, s40, 4 +; GFX908-NEXT: v_writelane_b32 v62, s41, 5 +; GFX908-NEXT: v_writelane_b32 v62, s42, 6 +; GFX908-NEXT: v_writelane_b32 v62, s43, 7 +; GFX908-NEXT: v_writelane_b32 v62, s44, 8 +; GFX908-NEXT: v_writelane_b32 v62, s45, 9 +; GFX908-NEXT: v_writelane_b32 v62, s46, 10 +; GFX908-NEXT: v_writelane_b32 v62, s47, 11 +; GFX908-NEXT: v_writelane_b32 v62, s48, 12 +; GFX908-NEXT: v_writelane_b32 v62, s49, 13 +; GFX908-NEXT: v_writelane_b32 v62, s50, 14 +; GFX908-NEXT: v_writelane_b32 v62, s51, 15 +; GFX908-NEXT: v_writelane_b32 v62, s52, 16 +; GFX908-NEXT: v_writelane_b32 v62, s53, 17 +; GFX908-NEXT: v_writelane_b32 v62, s54, 18 +; GFX908-NEXT: v_writelane_b32 v62, s55, 19 +; GFX908-NEXT: v_writelane_b32 v62, s56, 20 +; GFX908-NEXT: v_writelane_b32 v62, s57, 21 +; GFX908-NEXT: v_writelane_b32 v62, s58, 22 +; GFX908-NEXT: v_writelane_b32 v62, s59, 23 +; GFX908-NEXT: v_writelane_b32 v62, s60, 24 +; GFX908-NEXT: v_writelane_b32 v62, s61, 25 +; GFX908-NEXT: v_writelane_b32 v62, s62, 26 +; GFX908-NEXT: v_writelane_b32 v62, s63, 27 +; GFX908-NEXT: v_writelane_b32 v62, s64, 28 +; GFX908-NEXT: v_writelane_b32 v62, s65, 29 +; GFX908-NEXT: v_writelane_b32 v62, s66, 30 +; GFX908-NEXT: v_writelane_b32 v62, s67, 31 +; GFX908-NEXT: v_mov_b32_e32 v33, v30 +; GFX908-NEXT: v_mov_b32_e32 v34, v29 +; GFX908-NEXT: v_mov_b32_e32 v35, v28 +; GFX908-NEXT: v_mov_b32_e32 v36, v27 +; GFX908-NEXT: v_mov_b32_e32 v37, v26 +; GFX908-NEXT: v_mov_b32_e32 v38, v25 +; GFX908-NEXT: v_mov_b32_e32 v39, v24 +; GFX908-NEXT: v_mov_b32_e32 v48, v23 +; GFX908-NEXT: v_mov_b32_e32 v49, v22 +; GFX908-NEXT: v_mov_b32_e32 v50, v21 +; GFX908-NEXT: v_mov_b32_e32 v51, v20 +; GFX908-NEXT: v_mov_b32_e32 v52, v19 +; GFX908-NEXT: v_mov_b32_e32 v53, v18 +; GFX908-NEXT: v_mov_b32_e32 v54, v17 
+; GFX908-NEXT: v_mov_b32_e32 v55, v16 +; GFX908-NEXT: v_mov_b32_e32 v40, v15 +; GFX908-NEXT: v_mov_b32_e32 v41, v14 +; GFX908-NEXT: v_mov_b32_e32 v42, v13 +; GFX908-NEXT: v_mov_b32_e32 v43, v12 +; GFX908-NEXT: v_mov_b32_e32 v44, v11 +; GFX908-NEXT: v_mov_b32_e32 v45, v10 +; GFX908-NEXT: v_mov_b32_e32 v46, v9 +; GFX908-NEXT: v_mov_b32_e32 v47, v8 +; GFX908-NEXT: v_mov_b32_e32 v56, v7 +; GFX908-NEXT: v_mov_b32_e32 v57, v6 +; GFX908-NEXT: v_mov_b32_e32 v58, v5 +; GFX908-NEXT: v_mov_b32_e32 v59, v4 +; GFX908-NEXT: v_mov_b32_e32 v60, v3 +; GFX908-NEXT: v_mov_b32_e32 v61, v2 +; GFX908-NEXT: v_mov_b32_e32 v32, v1 +; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a14, v1 ; Reload Reuse +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; 
implicit-def: $sgpr4 +; GFX908-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GFX908-NEXT: v_mov_b32_e32 v1, v32 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a14 ; Reload Reuse +; GFX908-NEXT: v_mov_b32_e32 v2, v61 +; GFX908-NEXT: v_mov_b32_e32 v3, v60 +; GFX908-NEXT: v_mov_b32_e32 v4, v59 +; GFX908-NEXT: v_mov_b32_e32 v5, v58 +; GFX908-NEXT: v_mov_b32_e32 v6, v57 +; GFX908-NEXT: v_mov_b32_e32 v7, v56 +; GFX908-NEXT: v_mov_b32_e32 v8, v47 +; GFX908-NEXT: v_mov_b32_e32 v9, v46 +; GFX908-NEXT: v_mov_b32_e32 v10, v45 +; GFX908-NEXT: v_mov_b32_e32 v11, v44 +; GFX908-NEXT: v_mov_b32_e32 v12, v43 +; GFX908-NEXT: v_mov_b32_e32 v13, v42 +; GFX908-NEXT: v_mov_b32_e32 v14, v41 +; GFX908-NEXT: v_mov_b32_e32 v15, v40 +; GFX908-NEXT: v_mov_b32_e32 v16, v55 +; GFX908-NEXT: v_mov_b32_e32 v17, v54 +; GFX908-NEXT: v_mov_b32_e32 v18, v53 +; GFX908-NEXT: v_mov_b32_e32 v19, v52 +; GFX908-NEXT: v_mov_b32_e32 v20, v51 +; GFX908-NEXT: v_mov_b32_e32 v21, v50 +; GFX908-NEXT: v_mov_b32_e32 v22, v49 +; GFX908-NEXT: v_mov_b32_e32 v23, v48 +; GFX908-NEXT: v_mov_b32_e32 v24, v39 +; GFX908-NEXT: v_mov_b32_e32 v25, v38 +; GFX908-NEXT: v_mov_b32_e32 v26, v37 +; GFX908-NEXT: v_mov_b32_e32 v27, v36 +; GFX908-NEXT: v_mov_b32_e32 v28, v35 +; GFX908-NEXT: v_mov_b32_e32 v29, v34 +; GFX908-NEXT: v_mov_b32_e32 v30, v33 +; GFX908-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec +; GFX908-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_readlane_b32 s67, v62, 31 +; GFX908-NEXT: v_readlane_b32 s66, v62, 30 +; 
GFX908-NEXT: v_readlane_b32 s65, v62, 29 +; GFX908-NEXT: v_readlane_b32 s64, v62, 28 +; GFX908-NEXT: v_readlane_b32 s63, v62, 27 +; GFX908-NEXT: v_readlane_b32 s62, v62, 26 +; GFX908-NEXT: v_readlane_b32 s61, v62, 25 +; GFX908-NEXT: v_readlane_b32 s60, v62, 24 +; GFX908-NEXT: v_readlane_b32 s59, v62, 23 +; GFX908-NEXT: v_readlane_b32 s58, v62, 22 +; GFX908-NEXT: v_readlane_b32 s57, v62, 21 +; GFX908-NEXT: v_readlane_b32 s56, v62, 20 +; GFX908-NEXT: v_readlane_b32 s55, v62, 19 +; GFX908-NEXT: v_readlane_b32 s54, v62, 18 +; GFX908-NEXT: v_readlane_b32 s53, v62, 17 +; GFX908-NEXT: v_readlane_b32 s52, v62, 16 +; GFX908-NEXT: v_readlane_b32 s51, v62, 15 +; GFX908-NEXT: v_readlane_b32 s50, v62, 14 +; GFX908-NEXT: v_readlane_b32 s49, v62, 13 +; GFX908-NEXT: v_readlane_b32 s48, v62, 12 +; GFX908-NEXT: v_readlane_b32 s47, v62, 11 +; GFX908-NEXT: v_readlane_b32 s46, v62, 10 +; GFX908-NEXT: v_readlane_b32 s45, v62, 9 +; GFX908-NEXT: v_readlane_b32 s44, v62, 8 +; GFX908-NEXT: v_readlane_b32 s43, v62, 7 +; GFX908-NEXT: v_readlane_b32 s42, v62, 6 +; GFX908-NEXT: v_readlane_b32 s41, v62, 5 +; GFX908-NEXT: v_readlane_b32 s40, v62, 4 +; GFX908-NEXT: v_readlane_b32 s39, v62, 3 +; GFX908-NEXT: v_readlane_b32 s38, v62, 2 +; GFX908-NEXT: v_readlane_b32 s37, v62, 1 +; GFX908-NEXT: v_readlane_b32 s36, v62, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v42, a2 ; 
Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX908-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX908-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: s_mov_b64 exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_setpc_b64 s[30:31] +; ; GFX90a-LABEL: test_tuple: ; GFX90a: ; %bb.0: ; GFX90a-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90a-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX90a-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90a-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX90a-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX90a-NEXT: s_mov_b64 exec, s[4:5] ; GFX90a-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse @@ -385,41 +642,69 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse -; GFX90a-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse -; GFX90a-NEXT: v_writelane_b32 v31, s36, 0 -; GFX90a-NEXT: v_writelane_b32 v31, s37, 1 -; GFX90a-NEXT: v_writelane_b32 v31, s38, 2 -; GFX90a-NEXT: v_writelane_b32 v31, s39, 3 -; GFX90a-NEXT: v_writelane_b32 v31, s40, 4 -; GFX90a-NEXT: v_writelane_b32 v31, s41, 5 -; GFX90a-NEXT: v_writelane_b32 v31, s42, 6 -; GFX90a-NEXT: v_writelane_b32 v31, s43, 7 -; GFX90a-NEXT: v_writelane_b32 v31, s44, 8 -; GFX90a-NEXT: v_writelane_b32 v31, s45, 9 -; GFX90a-NEXT: v_writelane_b32 v31, s46, 10 -; GFX90a-NEXT: v_writelane_b32 v31, s47, 11 -; GFX90a-NEXT: v_writelane_b32 v31, s48, 12 -; GFX90a-NEXT: v_writelane_b32 v31, s49, 13 -; GFX90a-NEXT: v_writelane_b32 v31, s50, 14 -; GFX90a-NEXT: v_writelane_b32 v31, s51, 15 -; GFX90a-NEXT: v_writelane_b32 v31, s52, 16 -; GFX90a-NEXT: v_writelane_b32 v31, s53, 17 
-; GFX90a-NEXT: v_writelane_b32 v31, s54, 18 -; GFX90a-NEXT: v_writelane_b32 v31, s55, 19 -; GFX90a-NEXT: v_writelane_b32 v31, s56, 20 -; GFX90a-NEXT: v_writelane_b32 v31, s57, 21 -; GFX90a-NEXT: v_writelane_b32 v31, s58, 22 -; GFX90a-NEXT: v_writelane_b32 v31, s59, 23 -; GFX90a-NEXT: v_writelane_b32 v31, s60, 24 -; GFX90a-NEXT: v_writelane_b32 v31, s61, 25 -; GFX90a-NEXT: v_writelane_b32 v31, s62, 26 -; GFX90a-NEXT: v_writelane_b32 v31, s63, 27 -; GFX90a-NEXT: v_writelane_b32 v31, s64, 28 -; GFX90a-NEXT: v_writelane_b32 v31, s65, 29 -; GFX90a-NEXT: v_writelane_b32 v31, s66, 30 -; GFX90a-NEXT: v_writelane_b32 v31, s67, 31 -; GFX90a-NEXT: v_mov_b32_e32 v32, v0 -; GFX90a-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX90a-NEXT: v_writelane_b32 v63, s36, 0 +; GFX90a-NEXT: v_writelane_b32 v63, s37, 1 +; GFX90a-NEXT: v_writelane_b32 v63, s38, 2 +; GFX90a-NEXT: v_writelane_b32 v63, s39, 3 +; GFX90a-NEXT: v_writelane_b32 v63, s40, 4 +; GFX90a-NEXT: v_writelane_b32 v63, s41, 5 +; GFX90a-NEXT: v_writelane_b32 v63, s42, 6 +; GFX90a-NEXT: v_writelane_b32 v63, s43, 7 +; GFX90a-NEXT: v_writelane_b32 v63, s44, 8 +; GFX90a-NEXT: v_writelane_b32 v63, s45, 9 +; GFX90a-NEXT: v_writelane_b32 v63, s46, 10 +; GFX90a-NEXT: v_writelane_b32 v63, s47, 11 +; GFX90a-NEXT: v_writelane_b32 v63, s48, 12 +; GFX90a-NEXT: v_writelane_b32 v63, s49, 13 +; GFX90a-NEXT: v_writelane_b32 v63, s50, 14 +; GFX90a-NEXT: v_writelane_b32 v63, s51, 15 +; GFX90a-NEXT: v_writelane_b32 v63, s52, 16 +; GFX90a-NEXT: v_writelane_b32 v63, s53, 17 +; GFX90a-NEXT: v_writelane_b32 v63, s54, 18 +; GFX90a-NEXT: v_writelane_b32 v63, s55, 19 +; GFX90a-NEXT: v_writelane_b32 v63, s56, 20 +; GFX90a-NEXT: v_writelane_b32 v63, s57, 21 +; GFX90a-NEXT: v_writelane_b32 v63, s58, 22 +; GFX90a-NEXT: v_writelane_b32 v63, s59, 23 +; GFX90a-NEXT: v_writelane_b32 v63, s60, 24 +; GFX90a-NEXT: v_writelane_b32 v63, s61, 25 +; GFX90a-NEXT: v_writelane_b32 v63, s62, 26 +; GFX90a-NEXT: v_writelane_b32 v63, s63, 27 +; GFX90a-NEXT: 
v_writelane_b32 v63, s64, 28 +; GFX90a-NEXT: v_writelane_b32 v63, s65, 29 +; GFX90a-NEXT: v_writelane_b32 v63, s66, 30 +; GFX90a-NEXT: v_writelane_b32 v63, s67, 31 +; GFX90a-NEXT: v_mov_b32_e32 v33, v30 +; GFX90a-NEXT: v_mov_b32_e32 v34, v29 +; GFX90a-NEXT: v_mov_b32_e32 v35, v28 +; GFX90a-NEXT: v_mov_b32_e32 v36, v27 +; GFX90a-NEXT: v_mov_b32_e32 v37, v26 +; GFX90a-NEXT: v_mov_b32_e32 v38, v25 +; GFX90a-NEXT: v_mov_b32_e32 v39, v24 +; GFX90a-NEXT: v_mov_b32_e32 v48, v23 +; GFX90a-NEXT: v_mov_b32_e32 v49, v22 +; GFX90a-NEXT: v_mov_b32_e32 v50, v21 +; GFX90a-NEXT: v_mov_b32_e32 v51, v20 +; GFX90a-NEXT: v_mov_b32_e32 v52, v19 +; GFX90a-NEXT: v_mov_b32_e32 v53, v18 +; GFX90a-NEXT: v_mov_b32_e32 v54, v17 +; GFX90a-NEXT: v_mov_b32_e32 v55, v16 +; GFX90a-NEXT: v_mov_b32_e32 v40, v15 +; GFX90a-NEXT: v_mov_b32_e32 v41, v14 +; GFX90a-NEXT: v_mov_b32_e32 v42, v13 +; GFX90a-NEXT: v_mov_b32_e32 v43, v12 +; GFX90a-NEXT: v_mov_b32_e32 v44, v11 +; GFX90a-NEXT: v_mov_b32_e32 v45, v10 +; GFX90a-NEXT: v_mov_b32_e32 v46, v9 +; GFX90a-NEXT: v_mov_b32_e32 v47, v8 +; GFX90a-NEXT: v_mov_b32_e32 v56, v7 +; GFX90a-NEXT: v_mov_b32_e32 v57, v6 +; GFX90a-NEXT: v_mov_b32_e32 v58, v5 +; GFX90a-NEXT: v_mov_b32_e32 v59, v4 +; GFX90a-NEXT: v_mov_b32_e32 v60, v3 +; GFX90a-NEXT: v_mov_b32_e32 v61, v2 +; GFX90a-NEXT: v_mov_b32_e32 v62, v1 +; GFX90a-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX90a-NEXT: ; implicit-def: $sgpr4 ; GFX90a-NEXT: ; implicit-def: $sgpr4 ; GFX90a-NEXT: ; implicit-def: $sgpr4 @@ -452,74 +737,72 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: ; implicit-def: $sgpr4 ; GFX90a-NEXT: ; implicit-def: $sgpr4 ; GFX90a-NEXT: ; implicit-def: $sgpr4 -; GFX90a-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec -; GFX90a-NEXT: v_mov_b32_e32 v33, 
v1 -; GFX90a-NEXT: v_mov_b32_e32 v34, v2 -; GFX90a-NEXT: v_mov_b32_e32 v35, v3 -; GFX90a-NEXT: v_mov_b32_e32 v36, v4 -; GFX90a-NEXT: v_mov_b32_e32 v37, v5 -; GFX90a-NEXT: v_mov_b32_e32 v38, v6 -; GFX90a-NEXT: v_mov_b32_e32 v39, v7 -; GFX90a-NEXT: v_mov_b32_e32 v40, v8 -; GFX90a-NEXT: v_mov_b32_e32 v41, v9 -; GFX90a-NEXT: v_mov_b32_e32 v42, v10 -; GFX90a-NEXT: v_mov_b32_e32 v43, v11 -; GFX90a-NEXT: v_mov_b32_e32 v44, v12 -; GFX90a-NEXT: v_mov_b32_e32 v45, v13 -; GFX90a-NEXT: v_mov_b32_e32 v46, v14 -; GFX90a-NEXT: v_mov_b32_e32 v47, v15 -; GFX90a-NEXT: v_mov_b32_e32 v48, v16 -; GFX90a-NEXT: v_mov_b32_e32 v49, v17 -; GFX90a-NEXT: v_mov_b32_e32 v50, v18 -; GFX90a-NEXT: v_mov_b32_e32 v51, v19 -; GFX90a-NEXT: v_mov_b32_e32 v52, v20 -; GFX90a-NEXT: v_mov_b32_e32 v53, v21 -; GFX90a-NEXT: v_mov_b32_e32 v54, v22 -; GFX90a-NEXT: v_mov_b32_e32 v55, v23 -; GFX90a-NEXT: v_mov_b32_e32 v56, v24 -; GFX90a-NEXT: v_mov_b32_e32 v57, v25 -; GFX90a-NEXT: v_mov_b32_e32 v58, v26 -; GFX90a-NEXT: v_mov_b32_e32 v59, v27 -; GFX90a-NEXT: v_mov_b32_e32 v60, v28 -; GFX90a-NEXT: v_mov_b32_e32 v61, v29 -; GFX90a-NEXT: v_mov_b32_e32 v62, v30 -; GFX90a-NEXT: ; kill: def $vgpr63 killed $vgpr0 killed $exec +; GFX90a-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GFX90a-NEXT: v_mov_b32_e32 v1, v62 +; GFX90a-NEXT: v_mov_b32_e32 v2, v61 +; GFX90a-NEXT: v_mov_b32_e32 v3, v60 +; GFX90a-NEXT: v_mov_b32_e32 v4, v59 +; GFX90a-NEXT: v_mov_b32_e32 v5, v58 +; GFX90a-NEXT: v_mov_b32_e32 v6, v57 +; GFX90a-NEXT: v_mov_b32_e32 v7, v56 +; GFX90a-NEXT: v_mov_b32_e32 v8, v47 +; GFX90a-NEXT: v_mov_b32_e32 v9, v46 +; GFX90a-NEXT: v_mov_b32_e32 v10, v45 +; GFX90a-NEXT: v_mov_b32_e32 v11, v44 +; GFX90a-NEXT: v_mov_b32_e32 v12, v43 +; GFX90a-NEXT: v_mov_b32_e32 v13, v42 +; GFX90a-NEXT: v_mov_b32_e32 
v14, v41 +; GFX90a-NEXT: v_mov_b32_e32 v15, v40 +; GFX90a-NEXT: v_mov_b32_e32 v16, v55 +; GFX90a-NEXT: v_mov_b32_e32 v17, v54 +; GFX90a-NEXT: v_mov_b32_e32 v18, v53 +; GFX90a-NEXT: v_mov_b32_e32 v19, v52 +; GFX90a-NEXT: v_mov_b32_e32 v20, v51 +; GFX90a-NEXT: v_mov_b32_e32 v21, v50 +; GFX90a-NEXT: v_mov_b32_e32 v22, v49 +; GFX90a-NEXT: v_mov_b32_e32 v23, v48 +; GFX90a-NEXT: v_mov_b32_e32 v24, v39 +; GFX90a-NEXT: v_mov_b32_e32 v25, v38 +; GFX90a-NEXT: v_mov_b32_e32 v26, v37 +; GFX90a-NEXT: v_mov_b32_e32 v27, v36 +; GFX90a-NEXT: v_mov_b32_e32 v28, v35 +; GFX90a-NEXT: v_mov_b32_e32 v29, v34 +; GFX90a-NEXT: v_mov_b32_e32 v30, v33 +; GFX90a-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX90a-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX90a-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_readlane_b32 s67, v31, 31 -; GFX90a-NEXT: v_readlane_b32 s66, v31, 30 -; GFX90a-NEXT: v_readlane_b32 s65, v31, 29 -; GFX90a-NEXT: v_readlane_b32 s64, v31, 28 -; GFX90a-NEXT: v_readlane_b32 s63, v31, 27 -; GFX90a-NEXT: v_readlane_b32 s62, v31, 26 -; GFX90a-NEXT: v_readlane_b32 s61, v31, 25 -; GFX90a-NEXT: v_readlane_b32 s60, v31, 24 -; GFX90a-NEXT: v_readlane_b32 s59, v31, 23 -; GFX90a-NEXT: v_readlane_b32 s58, v31, 22 -; GFX90a-NEXT: v_readlane_b32 s57, v31, 21 -; GFX90a-NEXT: v_readlane_b32 s56, v31, 20 -; GFX90a-NEXT: v_readlane_b32 s55, v31, 19 -; GFX90a-NEXT: v_readlane_b32 s54, v31, 18 -; GFX90a-NEXT: v_readlane_b32 s53, v31, 17 -; GFX90a-NEXT: v_readlane_b32 s52, v31, 16 -; GFX90a-NEXT: v_readlane_b32 s51, v31, 15 -; GFX90a-NEXT: v_readlane_b32 s50, v31, 14 -; GFX90a-NEXT: v_readlane_b32 s49, v31, 13 -; GFX90a-NEXT: v_readlane_b32 s48, v31, 12 -; GFX90a-NEXT: v_readlane_b32 s47, v31, 11 -; GFX90a-NEXT: v_readlane_b32 s46, v31, 10 -; 
GFX90a-NEXT: v_readlane_b32 s45, v31, 9 -; GFX90a-NEXT: v_readlane_b32 s44, v31, 8 -; GFX90a-NEXT: v_readlane_b32 s43, v31, 7 -; GFX90a-NEXT: v_readlane_b32 s42, v31, 6 -; GFX90a-NEXT: v_readlane_b32 s41, v31, 5 -; GFX90a-NEXT: v_readlane_b32 s40, v31, 4 -; GFX90a-NEXT: v_readlane_b32 s39, v31, 3 -; GFX90a-NEXT: v_readlane_b32 s38, v31, 2 -; GFX90a-NEXT: v_readlane_b32 s37, v31, 1 -; GFX90a-NEXT: v_readlane_b32 s36, v31, 0 -; GFX90a-NEXT: v_accvgpr_read_b32 v63, a15 ; Reload Reuse +; GFX90a-NEXT: v_readlane_b32 s67, v63, 31 +; GFX90a-NEXT: v_readlane_b32 s66, v63, 30 +; GFX90a-NEXT: v_readlane_b32 s65, v63, 29 +; GFX90a-NEXT: v_readlane_b32 s64, v63, 28 +; GFX90a-NEXT: v_readlane_b32 s63, v63, 27 +; GFX90a-NEXT: v_readlane_b32 s62, v63, 26 +; GFX90a-NEXT: v_readlane_b32 s61, v63, 25 +; GFX90a-NEXT: v_readlane_b32 s60, v63, 24 +; GFX90a-NEXT: v_readlane_b32 s59, v63, 23 +; GFX90a-NEXT: v_readlane_b32 s58, v63, 22 +; GFX90a-NEXT: v_readlane_b32 s57, v63, 21 +; GFX90a-NEXT: v_readlane_b32 s56, v63, 20 +; GFX90a-NEXT: v_readlane_b32 s55, v63, 19 +; GFX90a-NEXT: v_readlane_b32 s54, v63, 18 +; GFX90a-NEXT: v_readlane_b32 s53, v63, 17 +; GFX90a-NEXT: v_readlane_b32 s52, v63, 16 +; GFX90a-NEXT: v_readlane_b32 s51, v63, 15 +; GFX90a-NEXT: v_readlane_b32 s50, v63, 14 +; GFX90a-NEXT: v_readlane_b32 s49, v63, 13 +; GFX90a-NEXT: v_readlane_b32 s48, v63, 12 +; GFX90a-NEXT: v_readlane_b32 s47, v63, 11 +; GFX90a-NEXT: v_readlane_b32 s46, v63, 10 +; GFX90a-NEXT: v_readlane_b32 s45, v63, 9 +; GFX90a-NEXT: v_readlane_b32 s44, v63, 8 +; GFX90a-NEXT: v_readlane_b32 s43, v63, 7 +; GFX90a-NEXT: v_readlane_b32 s42, v63, 6 +; GFX90a-NEXT: v_readlane_b32 s41, v63, 5 +; GFX90a-NEXT: v_readlane_b32 s40, v63, 4 +; GFX90a-NEXT: v_readlane_b32 s39, v63, 3 +; GFX90a-NEXT: v_readlane_b32 s38, v63, 2 +; GFX90a-NEXT: v_readlane_b32 s37, v63, 1 +; GFX90a-NEXT: v_readlane_b32 s36, v63, 0 ; GFX90a-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse 
; GFX90a-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse @@ -535,8 +818,8 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse -; GFX90a-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX90a-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90a-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX90a-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX90a-NEXT: s_mov_b64 exec, s[4:5] ; GFX90a-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll index 92efbe5a71826..2b96e10fd3cc3 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -14,17 +14,17 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v36, v16 ; GFX9-NEXT: v_mov_b32_e32 v35, v15 ; GFX9-NEXT: v_mov_b32_e32 v34, v14 ; GFX9-NEXT: v_mov_b32_e32 v33, v13 ; GFX9-NEXT: v_mov_b32_e32 v32, v12 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, 
s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART @@ -33,30 +33,30 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 +; GFX9-NEXT: v_writelane_b32 v44, s4, 2 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-NEXT: v_writelane_b32 v44, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v41 -; GFX9-NEXT: v_mov_b32_e32 v1, v42 -; GFX9-NEXT: v_mov_b32_e32 v2, v43 -; GFX9-NEXT: v_mov_b32_e32 v3, v44 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: v_mov_b32_e32 v1, v41 +; GFX9-NEXT: v_mov_b32_e32 v2, v42 +; GFX9-NEXT: v_mov_b32_e32 v3, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 +; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: v_readlane_b32 s4, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -69,7 +69,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v36, v16 @@ -77,10 +77,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: v_mov_b32_e32 v34, v14 ; GFX10-NEXT: v_mov_b32_e32 v33, v13 ; GFX10-NEXT: v_mov_b32_e32 v32, v12 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill 
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART @@ -89,31 +89,31 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s4, 2 +; GFX10-NEXT: v_writelane_b32 v44, s4, 2 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v44, s30, 0 +; GFX10-NEXT: v_writelane_b32 v44, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v41 -; GFX10-NEXT: v_mov_b32_e32 v1, v42 -; GFX10-NEXT: v_mov_b32_e32 v2, v43 -; GFX10-NEXT: v_mov_b32_e32 v3, v44 +; GFX10-NEXT: v_mov_b32_e32 v0, v40 +; GFX10-NEXT: v_mov_b32_e32 v1, v41 +; GFX10-NEXT: v_mov_b32_e32 v2, v42 +; GFX10-NEXT: v_mov_b32_e32 v3, v43 ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s4, v40, 2 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 +; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 offset:12 +; GFX10-NEXT: v_readlane_b32 s31, v44, 1 +; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: v_readlane_b32 s4, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -127,16 +127,16 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 ; GFX11-NEXT: v_mov_b32_e32 v32, v12 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART @@ -145,29 +145,29 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: 
s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v44, s0, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v44, s30, 0 +; GFX11-NEXT: v_writelane_b32 v44, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42 -; GFX11-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44 +; GFX11-NEXT: v_dual_mov_b32 v0, v40 :: v_dual_mov_b32 v1, v41 +; GFX11-NEXT: v_dual_mov_b32 v2, v42 :: v_dual_mov_b32 v3, v43 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 +; GFX11-NEXT: v_readlane_b32 s31, v44, 1 +; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -204,43 +204,43 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: 
s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v45, v16 -; GFX9-NEXT: v_mov_b32_e32 v44, v15 -; GFX9-NEXT: v_mov_b32_e32 v43, v14 -; GFX9-NEXT: v_mov_b32_e32 v42, v13 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v44, v16 +; GFX9-NEXT: v_mov_b32_e32 v43, v15 +; GFX9-NEXT: v_mov_b32_e32 v42, v14 +; GFX9-NEXT: v_mov_b32_e32 v41, v13 +; GFX9-NEXT: v_mov_b32_e32 v40, v12 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 +; GFX9-NEXT: v_writelane_b32 v45, s4, 2 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: 
v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v45, s30, 0 +; GFX9-NEXT: v_writelane_b32 v45, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v45, 1 +; GFX9-NEXT: v_readlane_b32 s30, v45, 0 +; GFX9-NEXT: v_readlane_b32 s4, v45, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -253,44 +253,44 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: s_mov_b32 s4, s33 ; 
GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s4, 2 +; GFX10-NEXT: v_writelane_b32 v45, s4, 2 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v41, v16 +; GFX10-NEXT: v_mov_b32_e32 v40, v16 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v42, v15 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_mov_b32_e32 v43, v14 -; GFX10-NEXT: v_mov_b32_e32 v44, v13 -; GFX10-NEXT: v_mov_b32_e32 v45, v12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-NEXT: v_writelane_b32 v45, s30, 0 +; 
GFX10-NEXT: v_mov_b32_e32 v42, v14 +; GFX10-NEXT: v_mov_b32_e32 v43, v13 +; GFX10-NEXT: v_mov_b32_e32 v44, v12 +; GFX10-NEXT: v_writelane_b32 v45, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s4, v40, 2 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 +; GFX10-NEXT: v_readlane_b32 s31, v45, 1 +; GFX10-NEXT: v_readlane_b32 s30, v45, 0 +; GFX10-NEXT: v_readlane_b32 s4, v45, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -304,42 +304,42 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; 
GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v45, s0, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 +; GFX11-NEXT: v_dual_mov_b32 v40, v16 :: v_dual_mov_b32 v41, v15 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 -; GFX11-NEXT: v_mov_b32_e32 v45, v12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v45, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v42, v14 :: v_dual_mov_b32 v43, v13 +; GFX11-NEXT: v_mov_b32_e32 v44, v12 +; GFX11-NEXT: v_writelane_b32 v45, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: 
image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 +; GFX11-NEXT: v_readlane_b32 s31, v45, 1 +; GFX11-NEXT: v_readlane_b32 s30, v45, 0 +; GFX11-NEXT: v_readlane_b32 s0, v45, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index ab7d3ca0ab425..e79cb66dcd776 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -371,13 +371,13 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b32 s48, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 
offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 -; GFX9-O0-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v2, s31, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 @@ -393,9 +393,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b32 s38, s43 ; GFX9-O0-NEXT: s_mov_b32 s39, s42 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: s_getpc_b64 s[42:43] @@ -405,18 +405,18 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b64 s[44:45], s[0:1] ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[44:45] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[46:47] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 @@ -430,33 +430,33 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v2, s30, 0 +; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 -; GFX9-O3-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called@rel32@hi+12 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-O3-NEXT: s_mov_b64 exec, 
s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 -; GFX9-O3-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-O3-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 @@ -562,12 +562,12 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b32 s48, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -581,8 
+581,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 ; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v8, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v8, s31, 1 +; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 ; GFX9-O0-NEXT: s_mov_b32 s38, s6 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 @@ -606,11 +606,11 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s37 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 @@ -618,10 +618,10 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[9:10] +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[8:9] ; GFX9-O0-NEXT: s_getpc_b64 s[34:35] ; GFX9-O0-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 @@ -651,8 +651,8 @@ define 
amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -660,15 +660,15 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v8, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v8, 0 +; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 ; GFX9-O0-NEXT: ; kill: killed $vgpr0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload @@ -689,50 +689,50 @@ define 
amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: s_mov_b32 s40, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v6, s30, 0 +; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 -; GFX9-O3-NEXT: v_writelane_b32 v6, s31, 1 +; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called_i64@gotpcrel32@hi+12 ; GFX9-O3-NEXT: s_load_dwordx2 s[36:37], s[36:37], 0x0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: 
v_mov_b32_e32 v8, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[38:39], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GFX9-O3-NEXT: s_mov_b64 exec, s[38:39] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 -; GFX9-O3-NEXT: v_readlane_b32 s31, v6, 1 -; GFX9-O3-NEXT: v_readlane_b32 s30, v6, 0 +; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 +; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload @@ -922,64 +922,64 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded 
Spill -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_writelane_b32 v32, s64, 0 -; GFX9-O0-NEXT: v_writelane_b32 v32, s65, 1 -; GFX9-O0-NEXT: v_writelane_b32 v32, s66, 2 -; GFX9-O0-NEXT: v_writelane_b32 v32, s67, 3 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_writelane_b32 v47, s64, 0 +; GFX9-O0-NEXT: v_writelane_b32 v47, s65, 1 +; GFX9-O0-NEXT: v_writelane_b32 v47, s66, 2 +; GFX9-O0-NEXT: v_writelane_b32 v47, s67, 3 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 @@ -987,147 +987,147 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v42, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v1, s19 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v47, s25 -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s26 -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s27 -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s28 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v46, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v45, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v44, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v43, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: 
v_mov_b32_e32 v2, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v42 
+; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v47 -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v46 -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v45 -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v44 -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 
v20, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v46 +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v45 +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v44 +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v27, v47 -; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46 -; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45 -; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44 -; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec -; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, 
s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v27, v46 +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v44 +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v43 +; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr42 killed $exec +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -1157,55 +1157,55 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v34, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v35, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v35, v9 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v35, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 
v36, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v35, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v37, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v36, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v37, v7 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v37, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v38, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v36, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v37, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v39, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v40, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v38, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v39, v5 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v39, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v40, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v41, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v34 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v32 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v36 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v35 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v34 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38 -; GFX9-O0-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen offset:20 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v37 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v36 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v39 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v42 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 @@ -1233,20 +1233,19 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-O0-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-O0-NEXT: v_mov_b32_e32 v25, s29 -; GFX9-O0-NEXT: v_readlane_b32 s67, v32, 3 -; GFX9-O0-NEXT: v_readlane_b32 s66, v32, 2 -; GFX9-O0-NEXT: v_readlane_b32 s65, v32, 1 -; GFX9-O0-NEXT: v_readlane_b32 s64, v32, 0 -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 
4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s67, v47, 3 +; GFX9-O0-NEXT: v_readlane_b32 s66, v47, 2 +; GFX9-O0-NEXT: v_readlane_b32 s65, v47, 1 +; GFX9-O0-NEXT: v_readlane_b32 s64, v47, 0 +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -1256,7 +1255,8 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded 
Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]