diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 301996847a584..6b2417143ca06 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5530,13 +5530,9 @@ If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies
 Instead the flat SCRATCH instructions are used.
 
 Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs
-that are used as a V# to access scratch.
-The compiler synthesizes the initialization value for the Private Segment
-Buffer in the kernel prologue, using the Flat Scratch Init to initialize low
-64-bit and a known constant for the high ones. If the Flat Scratch Init is not
-available, CP uses the value provided by the runtime. It is used, together with
-Scratch Wavefront Offset as an offset, to access the private memory space using
-a segment address. See
+that are used as a V# to access scratch. CP uses the value provided by the
+runtime. It is used, together with Scratch Wavefront Offset as an offset, to
+access the private memory space using a segment address. See
 :ref:`amdgpu-amdhsa-initial-kernel-execution-state`.
 
 The scratch V# is a four-aligned SGPR and always selected for the kernel as
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6327a818a12ed..d02aee71870ec 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -379,8 +379,7 @@ class PrologEpilogSGPRSpillBuilder {
 } // namespace llvm
 
 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
-// and return the FlatScratchInit Register used
-Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
+void SIFrameLowering::emitEntryFunctionFlatScratchInit(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -400,7 +399,6 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
   Register FlatScrInitLo;
   Register FlatScrInitHi;
-  Register FlatScratchInitReg;
 
   if (ST.isAmdPalOS()) {
     // Extract the scratch offset from the descriptor in the GIT
@@ -410,6 +408,7 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
     // Find unused reg to load flat scratch init into
     MachineRegisterInfo &MRI = MF.getRegInfo();
+    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
     unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
     AllSGPR64s = AllSGPR64s.slice(
@@ -418,28 +417,16 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
     for (MCPhysReg Reg : AllSGPR64s) {
       if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
           MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
-        FlatScratchInitReg = Reg;
+        FlatScrInit = Reg;
         break;
       }
     }
+    assert(FlatScrInit && "Failed to find free register for scratch init");
-  } else {
-    FlatScratchInitReg =
-        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
-
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-    MRI.addLiveIn(FlatScratchInitReg);
-    MBB.addLiveIn(FlatScratchInitReg);
-  }
-
-  assert(FlatScratchInitReg && "Failed to find free register for scratch init");
-
-  FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
-  FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
-
-  if (ST.isAmdPalOS()) {
-
+
+    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
+    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
 
-    buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);
+    buildGitPtr(MBB, I, DL, TII, FlatScrInit);
 
     // We now have the GIT ptr - now get the scratch descriptor from the entry
     // at offset 0 (or offset 16 for a compute shader).
@@ -454,8 +441,8 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
         MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
     unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
-    BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg)
-        .addReg(FlatScratchInitReg)
+    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
+        .addReg(FlatScrInit)
         .addImm(EncodedOffset) // offset
         .addImm(0)             // cpol
         .addMemOperand(MMO);
@@ -463,9 +450,20 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
     // Mask the offset in [47:0] of the descriptor
     const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
     auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
-        .addReg(FlatScrInitHi)
-        .addImm(0xffff);
+                   .addReg(FlatScrInitHi)
+                   .addImm(0xffff);
     And->getOperand(3).setIsDead(); // Mark SCC as dead.
+  } else {
+    Register FlatScratchInitReg =
+        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
+    assert(FlatScratchInitReg);
+
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    MRI.addLiveIn(FlatScratchInitReg);
+    MBB.addLiveIn(FlatScratchInitReg);
+
+    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
   }
 
   // Do a 64-bit pointer add.
@@ -488,21 +486,20 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
         addReg(FlatScrInitHi).
         addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
-    return FlatScratchInitReg;
+    return;
   }
 
-  assert(ST.getGeneration() == AMDGPUSubtarget::GFX9);
-
+  // For GFX9.
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
-    .addReg(FlatScrInitLo)
-    .addReg(ScratchWaveOffsetReg);
+      .addReg(FlatScrInitLo)
+      .addReg(ScratchWaveOffsetReg);
   auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                       AMDGPU::FLAT_SCR_HI)
                   .addReg(FlatScrInitHi)
                   .addImm(0);
   Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
 
-  return AMDGPU::FLAT_SCR;
+  return;
 }
 
 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
@@ -523,7 +520,6 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
       .addReg(FlatScrInitLo, RegState::Kill)
       .addImm(8);
   LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
-  return AMDGPU::FLAT_SCR;
 }
 
 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
@@ -615,15 +611,11 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  const Function &F = MF.getFunction();
   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 
   assert(MFI->isEntryFunction());
 
-  bool NeedsFlatScratchInit =
-      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
-      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
-       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
-
   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
@@ -649,7 +641,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   // Now that we have fixed the reserved SRSRC we need to locate the
   // (potentially) preloaded SRSRC.
   Register PreloadedScratchRsrcReg;
-  if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) {
+  if (ST.isAmdHsaOrMesa(F)) {
     PreloadedScratchRsrcReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
     if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
@@ -705,30 +697,33 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
   }
 
+  bool NeedsFlatScratchInit =
+      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
+      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
   if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
       PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
     MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
   }
 
-  Register FlatScratchInit;
   if (NeedsFlatScratchInit) {
-    FlatScratchInit =
-        emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
   }
 
   if (ScratchRsrcReg) {
-    emitEntryFunctionScratchRsrcRegSetup(
-        MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg,
-        PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
+    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
+                                         PreloadedScratchRsrcReg,
+                                         ScratchRsrcReg, ScratchWaveOffsetReg);
   }
 }
 
 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-    const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg,
-    Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {
+    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
+    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
@@ -776,8 +771,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
           .addImm(21)
           .addReg(Rsrc03);
     }
-  } else if (ST.isMesaGfxShader(Fn) ||
-             (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) {
+  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
     assert(!ST.isAmdHsaOrMesa(Fn));
 
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -836,26 +830,6 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
           .addImm(Rsrc23 >> 32)
           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
   } else if (ST.isAmdHsaOrMesa(Fn)) {
-
-    if (FlatScratchInit) {
-      const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
-      Register Lo_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
-      Register Hi_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
-      uint64_t Rsrc23 = TII->getScratchRsrcWords23();
-      I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
-                  TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
-              .addReg(FlatScratchInit)
-              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-      BuildMI(MBB, I, DL, SMovB32, Lo_32)
-          .addImm(Rsrc23 & 0xffffffff)
-          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-      BuildMI(MBB, I, DL, SMovB32, Hi_32)
-          .addImm(Rsrc23 >> 32)
-          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-      return;
-    }
-
     assert(PreloadedScratchRsrcReg);
 
     if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index f706d48b2dc10..b3feb759ed811 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -67,19 +67,19 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
                              MachineBasicBlock::iterator MI) const override;
 
 private:
-  Register
-  emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator I,
-                                   const DebugLoc &DL,
-                                   Register ScratchWaveOffsetReg) const;
+  void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        const DebugLoc &DL,
+                                        Register ScratchWaveOffsetReg) const;
 
   Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
 
   void emitEntryFunctionScratchRsrcRegSetup(
       MachineFunction &MF, MachineBasicBlock &MBB,
       MachineBasicBlock::iterator I, const DebugLoc &DL,
-      Register FlatScratchInit, Register ScratchRsrcReg,
-      Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
+      Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
+      Register ScratchWaveOffsetReg) const;
 
 public:
   bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 61bc28b70ee72..24652982c6584 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -13,11 +13,10 @@ define amdgpu_kernel void @kernel_caller_stack() {
 ; MUBUF-LABEL: kernel_caller_stack:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
-; MUBUF-NEXT:    s_mov_b32 s2, -1
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT:    s_mov_b32 s3, 0xe00000
+; MUBUF-NEXT:    s_add_u32 s0, s0, s7
 ; MUBUF-NEXT:    s_mov_b32 s32, 0
-; MUBUF-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
@@ -62,10 +61,9 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; MUBUF-LABEL: kernel_caller_byval:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
-; MUBUF-NEXT:    s_mov_b32 s2, -1
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT:    s_mov_b32 s3, 0xe00000
-; MUBUF-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; MUBUF-NEXT:    s_add_u32 s0, s0, s7
+; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 609b5e6f49ef1..a439c0f51ffe9 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -48,20 +48,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
 ; FIXEDABI-SDAG:       ; %bb.0:
 ; FIXEDABI-SDAG-NEXT:    s_add_i32 s4, s4, s9
-; FIXEDABI-SDAG-NEXT:    s_mov_b32 s2, -1
-; FIXEDABI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; FIXEDABI-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; FIXEDABI-SDAG-NEXT:    s_mov_b32 s3, 0x11e80000
+; FIXEDABI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; FIXEDABI-SDAG-NEXT:    s_add_u32 s0, s0, s9
 ; FIXEDABI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; FIXEDABI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; FIXEDABI-SDAG-NEXT:    s_addc_u32 s1, s1, 0
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32
s14, s8 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2 ; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0 +; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 @@ -71,20 +70,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-GISEL: ; %bb.0: ; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9 -; FIXEDABI-GISEL-NEXT: s_mov_b32 s2, -1 -; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; FIXEDABI-GISEL-NEXT: s_mov_b32 s3, 0x11e80000 +; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch +; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0 +; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5] ; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 ; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index 74c6bb599cb9b..7c8d40c49bb80 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -10,9 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s3, 0x31c16000 -; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11] +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index c06f213b9eb66..5a128c7541d1e 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -5,14 +5,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-LABEL: name: f1 ; GFX90A: bb.0.bb: ; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0 ; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, 
implicit-def $scc ; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc - ; GFX90A-NEXT: $sgpr2 = S_MOV_B32 4294967295, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $sgpr3 = S_MOV_B32 14680064, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index f72d22bb5d75c..863bd0d8c7529 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -129,13 +129,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i1_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 @@ -235,9 +234,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 @@ -341,9 +339,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 @@ -425,13 +422,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i8_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: 
s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 @@ -529,9 +525,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 @@ -630,9 +625,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 @@ -713,13 +707,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 @@ -816,9 +809,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 @@ -917,9 +909,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 @@ -1000,13 +991,12 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: 
s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 @@ -1088,14 +1078,13 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 @@ -1193,13 +1182,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 @@ -1290,16 +1278,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 @@ -1404,13 +1391,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: s_mov_b32 s32, 0 @@ -1528,13 +1514,12 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_mov_b32 s4, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_mov_b32 s5, s4 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; 
HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: v_mov_b32_e32 v6, 3 @@ -1620,13 +1605,12 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 @@ -1705,13 +1689,12 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 @@ -1793,14 +1776,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 @@ -1886,15 +1868,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 @@ -1987,17 +1968,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5f32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: 
s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: v_mov_b32_e32 v3, -1.0 ; HSA-NEXT: v_mov_b32_e32 v4, 0.5 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 @@ -2079,14 +2059,13 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 @@ -2175,16 +2154,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 ; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 @@ -2280,11 +2258,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f64_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -2292,6 +2268,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v4, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 @@ -2380,15 +2357,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; HSA-LABEL: test_call_external_void_func_v2i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 
-; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 @@ -2480,15 +2456,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2581,15 +2556,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2673,14 +2647,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 3 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 @@ -2764,14 +2737,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v3f16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; HSA-NEXT: v_mov_b32_e32 v1, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, 
external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 @@ -2863,15 +2835,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -2957,14 +2928,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i16_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40003 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 @@ -3055,15 +3025,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; HSA-LABEL: test_call_external_void_func_v2f16: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 @@ -3151,15 +3120,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3242,14 +3210,13 @@ define amdgpu_kernel void 
@test_call_external_void_func_v2i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v2i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 @@ -3335,15 +3302,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 @@ -3432,16 +3398,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; HSA-LABEL: test_call_external_void_func_v3i32_i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: v_mov_b32_e32 v3, 6 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 @@ -3528,15 +3493,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3626,16 +3590,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v4i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; 
HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -3728,17 +3691,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v5i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: v_mov_b32_e32 v4, 5 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 @@ -3841,14 +3803,13 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 @@ -3954,11 +3915,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-LABEL: test_call_external_void_func_v8i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 @@ -3968,6 +3927,7 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v6, 7 ; HSA-NEXT: v_mov_b32_e32 v7, 8 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 @@ -4078,6 +4038,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4085,9 +4046,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: 
buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 ; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 @@ -4224,6 +4183,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4235,10 +4195,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 @@ -4401,10 +4359,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 @@ -4509,15 +4466,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; HSA-LABEL: test_call_external_i32_func_i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_mov_b32 s39, 0x1100f000 ; HSA-NEXT: s_mov_b32 s38, -1 ; HSA-NEXT: s_getpc_b64 s[4:5] @@ -4625,14 +4581,13 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s7 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 ; HSA-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 -; HSA-NEXT: s_mov_b32 s2, -1 -; HSA-NEXT: s_mov_b32 s3, 0x11e80000 -; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: 
s_mov_b32 s32, 0
 ; HSA-NEXT: s_getpc_b64 s[4:5]
 ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
@@ -4747,11 +4702,9 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; HSA: ; %bb.0:
 ; HSA-NEXT: s_add_i32 s4, s4, s7
-; HSA-NEXT: s_mov_b32 s2, -1
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_mov_b32 s3, 0x11e80000
-; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_add_u32 s0, s0, s7
+; HSA-NEXT: s_addc_u32 s1, s1, 0
 ; HSA-NEXT: v_mov_b32_e32 v0, 3
 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0
 ; HSA-NEXT: v_mov_b32_e32 v0, 8
@@ -4759,6 +4712,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0
 ; HSA-NEXT: s_movk_i32 s32, 0x400
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT: s_getpc_b64 s[4:5]
 ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
@@ -4923,11 +4877,9 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; HSA: ; %bb.0:
 ; HSA-NEXT: s_add_i32 s6, s6, s9
-; HSA-NEXT: s_mov_b32 s2, -1
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_mov_b32 s3, 0x11e80000
-; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_add_u32 s0, s0, s9
+; HSA-NEXT: s_addc_u32 s1, s1, 0
 ; HSA-NEXT: v_mov_b32_e32 v0, 3
 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0
 ; HSA-NEXT: v_mov_b32_e32 v0, 8
@@ -4935,6 +4887,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0
 ; HSA-NEXT: s_movk_i32 s32, 0x800
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT: s_getpc_b64 s[4:5]
 ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
 ; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
@@ -5132,13 +5085,12 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; HSA-NEXT: s_add_u32 s0, s0, s7
 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT: s_mov_b32 s6, -1
-; HSA-NEXT: s_mov_b32 s2, -1
-; HSA-NEXT: s_mov_b32 s3, 0x11e80000
+; HSA-NEXT: s_addc_u32 s1, s1, 0
 ; HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT: s_mov_b32 s32, 0
 ; HSA-NEXT: s_getpc_b64 s[4:5]
 ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
@@ -5387,15 +5339,14 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; HSA: ; %bb.0: ; %entry
 ; HSA-NEXT: s_add_i32 s6, s6, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_add_u32 s0, s0, s9
 ; HSA-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; HSA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x80
 ; HSA-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; HSA-NEXT: s_mov_b32 s2, -1
-; HSA-NEXT: s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT: s_addc_u32 s1, s1, 0
 ; HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; HSA-NEXT: v_mov_b32_e32 v0, s23
 ; HSA-NEXT: v_mov_b32_e32 v1, s6
diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
index 8e2fca554e28c..c62a082459105 100644
--- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
@@ -11,11 +11,10 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz
 ; CHECK-LABEL: known_x_0:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
+; CHECK-NEXT: s_add_u32 s0, s0, s9
 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
 ; CHECK-NEXT: s_mov_b32 s32, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -32,10 +31,9 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz
 ; CHECK-LABEL: known_y_0:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0
 ; CHECK-NEXT: s_mov_b32 s32, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -52,10 +50,9 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz
 ; CHECK-LABEL: known_z_0:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
 ; CHECK-NEXT: s_mov_b32 s32, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -72,10 +69,9 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si
 ; CHECK-LABEL: known_yz_0:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: v_mov_b32_e32 v31, v0
 ; CHECK-NEXT: s_mov_b32 s32, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -92,10 +88,9 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si
 ; CHECK-LABEL: known_xz_0:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1
 ; CHECK-NEXT: s_mov_b32 s32, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -113,10 +108,9 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s
 ; CHECK-LABEL: known_xyz_0:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: v_mov_b32_e32 v31, 0
 ; CHECK-NEXT: s_mov_b32 s32, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 6db5effdf04ed..616e5f00fc1e5 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -7,13 +7,12 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_mov_b32 s3, 0xe00000
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v0, s4
 ; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
@@ -31,11 +30,10 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_mov_b32 s3, 0xe00000
+; GCN-NEXT: s_add_u32 s0, s0, s9
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: global_store_dword v0, v0, s[4:5]
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
@@ -54,12 +52,11 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-LABEL: call_no_wait_after_call:
 ; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_mov_b32 s3, 0xe00000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -77,12 +74,11 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-LABEL: call_no_wait_after_call_return_val:
 ; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_mov_b32 s3, 0xe00000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN-NEXT: s_getpc_b64 s[4:5]
@@ -103,13 +99,12 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xe00000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 0705d493d65d7..9f535a94e61f6 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -165,7 +165,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
 ; FIXEDABI: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI: v_or_b32_e32 v31, v0, v1
+; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1
 ; FIXEDABI-NOT: v0
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
@@ -181,7 +181,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
 ; FIXEDABI: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI: v_or_b32_e32 v31, v0, v1
+; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1
 ; FIXEDABI-NOT: v0
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
@@ -198,7 +198,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
 ; FIXEDABI-NOT: v2
 ; FIXEDABI:v_lshlrev_b32_e32 v0, 20, v2
 ; FIXEDABI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI: v_or_b32_e32 v31, v1, v0
+; FIXEDABI-NEXT: v_or_b32_e32 v31, v1, v0
 ; FIXEDABI-NOT: v0
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 718888391906f..6f42fd0aff135 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -68,14 +68,13 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_call:
 ; GFX803: ; %bb.0: ; %entry
 ; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_mov_b32 s2, -1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_add_u32 s0, s0, s15
 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_addc_u32 s1, s1, 0
 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -89,12 +88,11 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX900-LABEL: test_kern_call:
 ; GFX900: ; %bb.0: ; %entry
 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GFX900-NEXT: s_mov_b32 s2, -1
 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_mov_b32 s3, 0xe00000
+; GFX900-NEXT: s_add_u32 s0, s0, s15
 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -114,12 +112,11 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: s_mov_b32 s2, -1
-; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
-; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
+; GFX1010-NEXT: s_add_u32 s0, s0, s15
+; GFX1010-NEXT: s_addc_u32 s1, s1, 0
 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_getpc_b64 s[16:17]
 ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -151,14 +148,13 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_stack_and_call:
 ; GFX803: ; %bb.0: ; %entry
 ; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_mov_b32 s2, -1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_add_u32 s0, s0, s15
 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_addc_u32 s1, s1, 0
 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT: v_mov_b32_e32 v3, 0
 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
@@ -175,12 +171,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX900-LABEL: test_kern_stack_and_call:
 ; GFX900: ; %bb.0: ; %entry
 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GFX900-NEXT: s_mov_b32 s2, -1
 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_mov_b32 s3, 0xe00000
+; GFX900-NEXT: s_add_u32 s0, s0, s15
 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
@@ -204,11 +199,10 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_mov_b32 s2, -1
-; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
-; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
-; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_add_u32 s0, s0, s15
+; GFX1010-NEXT: s_addc_u32 s1, s1, 0
 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
@@ -317,14 +311,13 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX803-LABEL: test_force_fp_kern_call:
 ; GFX803: ; %bb.0: ; %entry
 ; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_mov_b32 s2, -1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_add_u32 s0, s0, s15
 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_addc_u32 s1, s1, 0
 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
 ; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -339,12 +332,11 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX900-LABEL: test_force_fp_kern_call:
 ; GFX900: ; %bb.0: ; %entry
 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GFX900-NEXT: s_mov_b32 s2, -1
 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_mov_b32 s3, 0xe00000
+; GFX900-NEXT: s_add_u32 s0, s0, s15
 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -366,12 +358,11 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT: s_mov_b32 s2, -1
-; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
-; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
+; GFX1010-NEXT: s_add_u32 s0, s0, s15
+; GFX1010-NEXT: s_addc_u32 s1, s1, 0
 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_getpc_b64 s[16:17]
 ; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
 ; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -422,15 +413,14 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX803-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX803: ; %bb.0: ; %entry
 ; GFX803-NEXT: s_add_i32 s10, s10, s15
-; GFX803-NEXT: s_mov_b32 s2, -1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT: s_mov_b32 s3, 0x11e80000
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_add_u32 s0, s0, s15
 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT: s_mov_b32 s33, 0
-; GFX803-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT: s_addc_u32 s1, s1, 0
 ; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT: v_mov_b32_e32 v3, 0
 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
@@ -447,13 +437,12 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX900-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX900: ; %bb.0: ; %entry
 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GFX900-NEXT: s_mov_b32 s2, -1
 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT: s_mov_b32 s3, 0xe00000
+; GFX900-NEXT: s_add_u32 s0, s0, s15
 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT: s_mov_b32 s33, 0
-; GFX900-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
@@ -478,11 +467,10 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0
-; GFX1010-NEXT: s_mov_b32 s2, -1
-; GFX1010-NEXT: s_mov_b32 s3, 0x31c16000
-; GFX1010-NEXT: s_mov_b64 s[0:1], s[10:11]
-; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT: s_add_u32 s0, s0, s15
+; GFX1010-NEXT: s_addc_u32 s1, s1, 0
 ; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33
 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 68c632a0bf6f4..11871db1ef656 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -180,9 +180,8 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xe00000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s15
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_bitcmp1_b32 s8, 0
@@ -230,9 +229,8 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xe00000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s15
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_bitcmp1_b32 s8, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index 2d019efe2417a..47110d9491887 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -13,6 +13,8 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-NEXT: s_add_u32 s0, s0, s7
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
 ; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
@@ -23,17 +25,14 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 ; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
 ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_and_b32 s4, 1, s7
-; GFX9-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-NEXT: s_cmp_eq_u32 s4, 1
-; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
 ; GFX9-NEXT: s_cselect_b32 s5, s13, s11
 ; GFX9-NEXT: s_cselect_b32 s4, s12, s10
-; GFX9-NEXT: s_mov_b64 s[8:9], 0
 ; GFX9-NEXT: s_mov_b32 s12, s6
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index a66ed939fef60..408199bbc9223 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -12,9 +12,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT: s_add_i32 s12, s12, s17
 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b32 s13, s15
 ; GCN-NEXT: s_mov_b32 s12, s14
 ; GCN-NEXT: s_getpc_b64 s[14:15]
@@ -38,9 +37,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT: s_add_i32 s12, s12, s17
 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT: s_mov_b32 s2, -1
-; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000
-; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
 ; GISEL-NEXT: s_mov_b32 s13, s15
 ; GISEL-NEXT: s_mov_b32 s12, s14
 ; GISEL-NEXT: s_getpc_b64 s[14:15]
@@ -69,9 +67,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT: s_add_i32 s12, s12, s17
 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b32 s13, s15
 ; GCN-NEXT: s_mov_b32 s12, s14
 ; GCN-NEXT: s_getpc_b64 s[14:15]
@@ -96,9 +93,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT: s_add_i32 s12, s12, s17
 ; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT: s_mov_b32 s2, -1
-; GISEL-NEXT: s_mov_b32 s3, 0x1e8f000
-; GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
 ; GISEL-NEXT: s_mov_b32 s13, s15
 ; GISEL-NEXT: s_mov_b32 s12, s14
 ; GISEL-NEXT: s_getpc_b64 s[14:15]
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 8843efd2c3c79..6e905542ce53c 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -11,9 +11,8 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
 ; CHECK-NEXT: s_mov_b32 s33, 0
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_add_u32 s0, s0, s15
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT: v_mov_b32_e32 v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 4851c4f73456a..66f31bbf7afe0 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -118,11 +118,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_getpc_b64 s[6:7]
 ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -178,11 +177,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_getpc_b64 s[6:7]
 ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -238,11 +236,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_getpc_b64 s[6:7]
 ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -298,11 +295,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_getpc_b64 s[6:7]
 ; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -345,6 +341,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -353,9 +351,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_mov_b32 s15, 0
 ; CHECK-NEXT: ds_write_b16 v0, v1
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -375,15 +370,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
 ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -416,6 +410,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -424,9 +420,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_mov_b32 s15, 2
 ; CHECK-NEXT: ds_write_b16 v0, v1
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -446,15 +439,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
 ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -487,6 +479,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -495,9 +489,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_mov_b32 s15, 1
 ; CHECK-NEXT: ds_write_b16 v0, v1
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -517,15 +508,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
 ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -558,6 +548,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
@@ -566,9 +558,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_mov_b32 s15, 3
 ; CHECK-NEXT: ds_write_b16 v0, v1
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -588,15 +577,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT: s_addc_u32 s7, s7, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT: s_add_u32 s0, s0, s9
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_add_u32 s8, s4, 8
 ; CHECK-NEXT: s_addc_u32 s9, s5, 0
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
 ; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CHECK-NEXT: s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 26271a0a68652..61818dafd2b84 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -45,9 +45,8 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT: s_add_i32 s6, s6, s9
 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index f780188deaec1..bb7c43f76c8a1 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -164,9 +164,8 @@ define amdgpu_kernel void @k01() {
 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT: s_add_i32 s6, s6, s9
 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4
@@ -199,9 +198,8 @@ define amdgpu_kernel void @k23() {
 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT: s_add_i32 s6, s6, s9
 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4
@@ -242,9 +240,8 @@ define amdgpu_kernel void @k123() {
 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT: s_add_i32 s6, s6, s9
 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index fa4b93fd1d6b7..4d73436c519bd 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -229,9 +229,8 @@ define amdgpu_kernel void @k01() {
 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT: s_add_i32 s6, s6, s9
 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4
@@ -269,9 +268,8 @@ define amdgpu_kernel void @k23() {
 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT: s_add_i32 s6, s6, s9
 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4
@@ -312,9 +310,8 @@ define amdgpu_kernel void @k123() {
 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT: s_add_i32 s6, s6, s9
 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0x1e8f000
-; GCN-NEXT: s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index e17f311b11d8b..138a6a86cee98 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -44,18 +44,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0
+; CHECK-NEXT: s_add_u32 s0, s0, s15
 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: v_mov_b32_e32 v40, v0
 ; CHECK-NEXT: s_add_u32 s42, s34, 40
 ; CHECK-NEXT: v_mov_b32_e32 v31, v0
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT: s_addc_u32 s43, s35, 0
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
-; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
 ; CHECK-NEXT: s_mov_b32 s33, s14
 ; CHECK-NEXT: s_mov_b32 s40, s13
 ; CHECK-NEXT: s_mov_b32 s41, s12
@@ -782,18 +781,17 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; CHECK-NEXT: s_load_dwordx2 s[44:45], s[6:7], 0x10
+; CHECK-NEXT: s_add_u32 s0, s0, s15
 ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7]
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: v_mov_b32_e32 v40, v0
 ; CHECK-NEXT: s_add_u32 s42, s36, 40
 ; CHECK-NEXT: v_mov_b32_e32 v31, v0
 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_mov_b32 s2, -1
 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
 ; CHECK-NEXT: s_addc_u32 s43, s37, 0
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
-; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
 ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
 ; CHECK-NEXT: s_mov_b32 s33, s14
 ; CHECK-NEXT: s_mov_b32 s40, s13
 ; CHECK-NEXT: s_mov_b32 s41, s12
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 70a9bbbd47a3e..f70441e87a74b 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -69,9 +69,8 @@ define amdgpu_kernel void @kernel_call() {
 ; CHECK-NEXT: s_mov_b32 s32, 0
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_add_u32 s0, s0, s15
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; CHECK-NEXT: s_getpc_b64 s[16:17]
@@ -129,9 +128,8 @@ define amdgpu_kernel void @kernel_tailcall() {
 ; CHECK-NEXT: s_mov_b32 s32, 0
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_add_u32 s0, s0, s15
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; CHECK-NEXT: s_getpc_b64 s[16:17]
@@ -242,9 +240,8 @@ define protected amdgpu_kernel void @kernel() {
 ; CHECK-NEXT: s_mov_b32 s32, 0
 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: s_mov_b32 s3, 0xe00000
-; CHECK-NEXT: s_mov_b64 s[0:1], flat_scratch
+; CHECK-NEXT: s_add_u32 s0, s0, s15
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; CHECK-NEXT: s_getpc_b64 s[16:17]
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index e6d9c0d6105f5..e7c5aaf043efb 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-NEXT: s_add_u32 s0, s0, s15
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16
 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
@@ -55,9 +55,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12
 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: s_mov_b64 s[0:1], flat_scratch
-; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0
 ; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0
 ; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3
 ; GFX9-NEXT: s_mov_b32 s32, 0
 ; GFX9-NEXT: ds_write_b64 v0, v[3:4]
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 8d8459ff0b1b2..1118cc3b16463 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -45,8 +45,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18
 ; GLOBALNESS1-NEXT: s_load_dword s7, s[38:39], 0x20
 ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
+; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s15
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
+; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s74, 0
 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,10 +73,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1
-; GLOBALNESS1-NEXT: s_mov_b32 s2, -1
 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3
-; GLOBALNESS1-NEXT: s_mov_b32 s3, 0xe00000
-; GLOBALNESS1-NEXT: s_mov_b64 s[0:1], flat_scratch
 ; GLOBALNESS1-NEXT: s_mov_b32 s68, s14
 ; GLOBALNESS1-NEXT: s_mov_b32 s69, s13
 ; GLOBALNESS1-NEXT: s_mov_b32 s70, s12
@@ -333,8 +332,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18
 ; GLOBALNESS0-NEXT: s_load_dword s7, s[38:39], 0x20
 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s10, s15
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
+; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s15
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
+; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0
 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
@@ -359,10 +360,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1
-; GLOBALNESS0-NEXT: s_mov_b32 s2, -1
 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3
-; GLOBALNESS0-NEXT: s_mov_b32 s3, 0xe00000
-; GLOBALNESS0-NEXT: s_mov_b64 s[0:1], flat_scratch
 ; GLOBALNESS0-NEXT: s_mov_b32 s66, s14
 ; GLOBALNESS0-NEXT: s_mov_b32 s67, s13
 ; GLOBALNESS0-NEXT: s_mov_b32 s68, s12
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
index 7d759089a7c0c..7840559c78eb6 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
@@ -14,9 +14,8 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id
 ; CHECK-NEXT: s_addc_u32 s11, s11, 0
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
-; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
+; CHECK-NEXT: s_add_u32 s0, s0, s15
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; CHECK-NEXT: v_mov_b32_e32 v5, 42