diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index c0edd0e09c746..44095ad611384 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -30,6 +30,11 @@ static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                       ST.getMaxNumSGPRs(MF) / 4);
 }
 
+static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
+                                       const MachineFunction &MF) {
+  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
+}
+
 // Find a scratch register that we can use at the start of the prologue to
 // re-align the stack pointer. We avoid using callee-save registers since they
 // may appear to be free when this is called from canUseAsPrologue (during
@@ -257,7 +262,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
 // Shift down registers reserved for the scratch RSRC.
 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
-    MachineFunction &MF, Register ScratchWaveOffsetReg) const {
+    MachineFunction &MF) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -269,9 +274,8 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
 
   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
 
-  if (ScratchRsrcReg == AMDGPU::NoRegister ||
-      !MRI.isPhysRegUsed(ScratchRsrcReg))
-    return AMDGPU::NoRegister;
+  if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
+    return Register();
 
   if (ST.hasSGPRInitBug() ||
       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
@@ -292,17 +296,13 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
 
   // Skip the last N reserved elements because they should have already been
   // reserved for VCC etc.
+  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
   for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
-    // reserved input we needed.
-    //
-    // FIXME: The preloaded SGPR count is not accurate for shaders as the
-    // scratch wave offset may be in a fixed SGPR or
-    // SITargetLowering::allocateSystemSGPRs may choose some free SGPR for the
-    // scratch wave offset. We explicitly avoid the scratch wave offset to
-    // account for this.
+    // reserved input we needed. Also for PAL, make sure we don't clobber
+    // the GIT pointer passed in SGPR0 or SGPR8.
     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
-        !TRI->isSubRegisterEq(Reg, ScratchWaveOffsetReg)) {
+        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -330,28 +330,28 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
 
   assert(MFI->isEntryFunction());
 
-  Register ScratchWaveOffsetReg = MFI->getPreloadedReg(
+  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
   // FIXME: Hack to not crash in situations which emitted an error.
-  if (ScratchWaveOffsetReg == AMDGPU::NoRegister)
+  if (!PreloadedScratchWaveOffsetReg)
     return;
 
   // We need to do the replacement of the private segment buffer register even
   // if there are no stack objects. There could be stores to undef or a
   // constant without an associated object.
   //
-  // This will return `AMDGPU::NoRegister` in cases where there are no actual
+  // This will return `Register()` in cases where there are no actual
   // uses of the SRSRC.
-  Register ScratchRsrcReg =
-      getEntryFunctionReservedScratchRsrcReg(MF, ScratchWaveOffsetReg);
+  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
 
   // Make the selected register live throughout the function.
-  if (ScratchRsrcReg != AMDGPU::NoRegister) {
+  if (ScratchRsrcReg) {
     for (MachineBasicBlock &OtherBB : MF) {
       if (&OtherBB != &MBB) {
         OtherBB.addLiveIn(ScratchRsrcReg);
@@ -361,12 +361,11 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
 
   // Now that we have fixed the reserved SRSRC we need to locate the
   // (potentially) preloaded SRSRC.
-  Register PreloadedScratchRsrcReg = AMDGPU::NoRegister;
+  Register PreloadedScratchRsrcReg;
   if (ST.isAmdHsaOrMesa(F)) {
     PreloadedScratchRsrcReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-    if (ScratchRsrcReg != AMDGPU::NoRegister &&
-        PreloadedScratchRsrcReg != AMDGPU::NoRegister) {
+    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
       // We added live-ins during argument lowering, but since they were not
       // used they were deleted. We're adding the uses now, so add them back.
       MRI.addLiveIn(PreloadedScratchRsrcReg);
@@ -379,6 +378,32 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
+  // We found the SRSRC first because it needs four registers and has an
+  // alignment requirement. If the SRSRC that we found clobbers the scratch
+  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
+  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to
+  // a free SGPR.
+  Register ScratchWaveOffsetReg;
+  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
+    ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
+    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+    AllSGPRs = AllSGPRs.slice(
+        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
+    for (MCPhysReg Reg : AllSGPRs) {
+      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
+        ScratchWaveOffsetReg = Reg;
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
+            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+        break;
+      }
+    }
+  } else {
+    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
+  }
+  assert(ScratchWaveOffsetReg);
+
   if (MF.getFrameInfo().hasCalls()) {
     Register SPReg = MFI->getStackPtrOffsetReg();
     assert(SPReg != AMDGPU::SP_REG);
@@ -392,16 +417,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
   }
 
-  if (MFI->hasFlatScratchInit() || ScratchRsrcReg != AMDGPU::NoRegister) {
-    MRI.addLiveIn(ScratchWaveOffsetReg);
-    MBB.addLiveIn(ScratchWaveOffsetReg);
+  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
+    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
   }
 
   if (MFI->hasFlatScratchInit()) {
     emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
   }
 
-  if (ScratchRsrcReg != AMDGPU::NoRegister) {
+  if (ScratchRsrcReg) {
     emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                          PreloadedScratchRsrcReg,
                                          ScratchRsrcReg, ScratchWaveOffsetReg);
@@ -437,19 +462,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
       const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
       BuildMI(MBB, I, DL, GetPC64, Rsrc01);
     }
-    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
-    if (ST.hasMergedShaders()) {
-      switch (MF.getFunction().getCallingConv()) {
-      case CallingConv::AMDGPU_HS:
-      case CallingConv::AMDGPU_GS:
-        // Low GIT address is passed in s8 rather than s0 for an LS+HS or
-        // ES+GS merged shader on gfx9+.
-        GitPtrLo = AMDGPU::SGPR8;
-        break;
-      default:
-        break;
-      }
-    }
+    Register GitPtrLo = MFI->getGITPtrLoReg(MF);
     MF.getRegInfo().addLiveIn(GitPtrLo);
     MBB.addLiveIn(GitPtrLo);
     BuildMI(MBB, I, DL, SMovB32, RsrcLo)
@@ -475,8 +488,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
         .addImm(0) // dlc
         .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
         .addMemOperand(MMO);
-  } else if (ST.isMesaGfxShader(Fn) ||
-             (PreloadedScratchRsrcReg == AMDGPU::NoRegister)) {
+  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
     assert(!ST.isAmdHsaOrMesa(Fn));
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
@@ -537,7 +549,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
         .addImm(Rsrc23 >> 32)
         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
   } else if (ST.isAmdHsaOrMesa(Fn)) {
-    assert(PreloadedScratchRsrcReg != AMDGPU::NoRegister);
+    assert(PreloadedScratchRsrcReg);
 
     if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
       BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
@@ -650,7 +662,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                Reg.FI.getValue());
   }
 
-  if (ScratchExecCopy != AMDGPU::NoRegister) {
+  if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -659,7 +671,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
     LiveRegs.addReg(ScratchExecCopy);
   }
 
-
   if (FuncInfo->FramePointerSaveIndex) {
     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
     assert(!MFI.isDeadObjectIndex(FI) &&
@@ -690,8 +701,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
     Register ScratchSPReg = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-    assert(ScratchSPReg != AMDGPU::NoRegister &&
-           ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
+    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
 
     // s_add_u32 tmp_reg, s32, NumBytes
     // s_and_b32 s32, tmp_reg, 0b111...0000
@@ -785,7 +795,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
       continue;
 
     const SIRegisterInfo &TRI = TII->getRegisterInfo();
-    if (ScratchExecCopy == AMDGPU::NoRegister) {
+    if (!ScratchExecCopy) {
       // See emitPrologue
       if (LiveRegs.empty()) {
        LiveRegs.init(*ST.getRegisterInfo());
@@ -809,7 +819,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                               FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
   }
 
-  if (ScratchExecCopy != AMDGPU::NoRegister) {
+  if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -991,7 +1001,7 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
 
   for (auto &CS : CSI) {
     if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
-      if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
+      if (FuncInfo->SGPRForFPSaveRestoreCopy)
         CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
       break;
     }
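Taken together, the hunks above reverse the old ordering: the four-register, aligned RSRC quad is now picked without regard to the scratch wave offset (on PAL it only has to avoid the GIT pointer), and the wave offset is relocated afterwards, and only if the chosen quad actually overlaps it. A self-contained sketch of that relocation scan, using plain unsigned values in place of MCPhysReg and illustrative names (pickWaveOffsetSGPR, Used) that are not LLVM API:

```cpp
#include <algorithm>
#include <cassert>
#include <optional>
#include <vector>

// Find a destination SGPR for the preloaded scratch wave offset: skip the
// preloaded SGPRs (mirroring AllSGPRs.slice(NumPreloaded)), then take the
// first register that is free, does not alias the RSRC quad
// [RsrcFirst, RsrcFirst+4), and is not the GIT pointer.
std::optional<unsigned>
pickWaveOffsetSGPR(unsigned MaxSGPRs, unsigned NumPreloaded,
                   unsigned RsrcFirst, unsigned GITPtrLo,
                   const std::vector<bool> &Used) {
  for (unsigned Reg = std::min(MaxSGPRs, NumPreloaded); Reg < MaxSGPRs; ++Reg) {
    bool AliasesRsrc = Reg >= RsrcFirst && Reg < RsrcFirst + 4;
    if (!Used[Reg] && !AliasesRsrc && Reg != GITPtrLo)
      return Reg; // destination of: $sgprN = COPY killed $sgpr<offset>
  }
  return std::nullopt; // the assert(ScratchWaveOffsetReg) above would fire
}

int main() {
  std::vector<bool> Used(104, false);
  Used[0] = true; // s0 holds a preloaded user SGPR argument
  // RSRC quad shifted down to s4..s7, GIT pointer in s8, wave offset
  // preloaded in s5: the offset overlaps the quad and must be moved; the
  // first acceptable destination is s1, as the MIR test below expects.
  assert(pickWaveOffsetSGPR(104, 1, 4, 8, Used) == 1u);
  return 0;
}
```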
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 5bd3b0b6ca991..e894320406610 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -61,9 +61,7 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
                                         const DebugLoc &DL,
                                         Register ScratchWaveOffsetReg) const;
 
-  Register
-  getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF,
-                                         Register ScratchWaveOffsetReg) const;
+  Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
 
   void emitEntryFunctionScratchRsrcRegSetup(
       MachineFunction &MF, MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 58f66d3a394c0..104138ea0f058 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -439,6 +439,27 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
   return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
 }
 
+Register
+SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.isAmdPalOS())
+    return Register();
+  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
+  if (ST.hasMergedShaders()) {
+    switch (MF.getFunction().getCallingConv()) {
+    case CallingConv::AMDGPU_HS:
+    case CallingConv::AMDGPU_GS:
+      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
+      // ES+GS merged shader on gfx9+.
+      GitPtrLo = AMDGPU::SGPR8;
+      return GitPtrLo;
+    default:
+      return GitPtrLo;
+    }
+  }
+  return GitPtrLo;
+}
+
 static yaml::StringValue regToString(Register Reg,
                                      const TargetRegisterInfo &TRI) {
   yaml::StringValue Dest;
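The new SIMachineFunctionInfo::getGITPtrLoReg centralizes a decision that emitEntryFunctionScratchRsrcRegSetup previously hard-coded inline, so the RSRC-shifting loop and the RSRC setup now agree on which SGPR to protect. A minimal model of its contract, with hypothetical names, plain unsigned register numbers, and bools standing in for the subtarget queries:

```cpp
#include <cassert>

enum class CC { AMDGPU_HS, AMDGPU_GS, AMDGPU_PS };

// Returns the SGPR number holding the low half of the GIT address, or
// ~0u (standing in for an invalid Register()) when not targeting PAL.
unsigned gitPtrLoSGPR(bool IsAmdPalOS, bool HasMergedShaders, CC Conv) {
  if (!IsAmdPalOS)
    return ~0u;
  // LS+HS and ES+GS merged shaders on gfx9+ receive the GIT pointer in
  // s8; every other shader stage receives it in s0.
  if (HasMergedShaders && (Conv == CC::AMDGPU_HS || Conv == CC::AMDGPU_GS))
    return 8;
  return 0;
}

int main() {
  assert(gitPtrLoSGPR(true, true, CC::AMDGPU_GS) == 8);    // merged GS: s8
  assert(gitPtrLoSGPR(true, false, CC::AMDGPU_GS) == 0);   // pre-gfx9: s0
  assert(gitPtrLoSGPR(true, true, CC::AMDGPU_PS) == 0);    // PS: always s0
  assert(gitPtrLoSGPR(false, true, CC::AMDGPU_GS) == ~0u); // non-PAL: none
  return 0;
}
```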
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index c6ccad800ccf3..ae04896bef72e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -676,6 +676,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
     return GITPtrHigh;
   }
 
+  Register getGITPtrLoReg(const MachineFunction &MF) const;
+
   uint32_t get32BitAddressHighBits() const {
     return HighBitsOf32BitAddress;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir b/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
new file mode 100644
index 0000000000000..054e022949d98
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
@@ -0,0 +1,48 @@
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -run-pass=prologepilog -o - %s | FileCheck %s
+
+# On PAL, we need to ensure that the SRSRC does not clobber the GIT pointer,
+# which is passed in SGPR8 for HS or GS.
+
+--- |
+
+  define amdgpu_gs void @shader(i32 inreg %mergedGroupInfo) {
+    ret void
+  }
+...
+---
+name: shader
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr100_sgpr101_sgpr102_sgpr103'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
+body: |
+  ; CHECK: $sgpr1 = COPY killed $sgpr5
+  ; CHECK: $sgpr4_sgpr5 = S_GETPC_B64
+  ; CHECK: $sgpr4 = S_MOV_B32 $sgpr8, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+  ; CHECK: $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM $sgpr4_sgpr5, 0, 0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 :: (dereferenceable invariant load 16, align 4, addrspace 4)
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0
+
+    $exec_lo = S_MOV_B32 -1
+    renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
+    renamable $sgpr0 = S_BFE_U32 killed renamable $sgpr0, 589836, implicit-def dead $scc
+    renamable $vcc_lo = V_CMP_GT_U32_e64 killed $sgpr0, killed $vgpr0, implicit $exec
+    $vcc_hi = IMPLICIT_DEF
+    $sgpr0 = S_AND_SAVEEXEC_B32 $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, undef renamable $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+
+  bb.2:
+    S_ENDPGM 0
+
+...
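Reading the CHECK lines in the new test: with $sgpr0 occupied by the mergedGroupInfo argument and the GIT pointer live in $sgpr8, the shifted-down RSRC settles on $sgpr4_sgpr5_sgpr6_sgpr7, the first free, aligned quad that stays clear of the GIT pointer. The preloaded wave offset in $sgpr5 falls inside that quad, so the prologue first copies it out to $sgpr1, and only then overwrites $sgpr4_sgpr5 with S_GETPC_B64 and the S_LOAD_DWORDX4 of the resource descriptor.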
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index af933dc94d6e2..591f7bfe5aed1 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -14,14 +14,14 @@
 ;
 ; GCN-LABEL: {{^}}ps_main:
-; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s6, -1
-; SI-DAG: s_mov_b32 s7, 0xe8f000
-; VI-DAG: s_mov_b32 s7, 0xe80000
-; GFX9-DAG: s_mov_b32 s7, 0xe00000
-; GFX10_W32-DAG: s_mov_b32 s7, 0x31c16000
-; GFX10_W64-DAG: s_mov_b32 s7, 0x31e16000
+; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s2, -1
+; SI-DAG: s_mov_b32 s3, 0xe8f000
+; VI-DAG: s_mov_b32 s3, 0xe80000
+; GFX9-DAG: s_mov_b32 s3, 0xe00000
+; GFX10_W32-DAG: s_mov_b32 s3, 0x31c16000
+; GFX10_W64-DAG: s_mov_b32 s3, 0x31e16000
 ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
 ; GCN-NOT: s_mov_b32 s0
@@ -39,7 +39,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 }
 
 ; GCN-LABEL: {{^}}vs_main:
-; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
 ; GCN-NOT: s_mov_b32 s0
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
@@ -51,7 +51,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 }
 
 ; GCN-LABEL: {{^}}cs_main:
-; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 define amdgpu_cs float @cs_main(i32 %idx) {
@@ -62,7 +62,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 }
 
 ; GCN-LABEL: {{^}}hs_main:
-; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
 ; SIVI-NOT: s_mov_b32 s0
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
@@ -79,7 +79,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 }
 
 ; GCN-LABEL: {{^}}gs_main:
-; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
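The scratch-simple.ll churn follows directly from dropping the wave-offset avoidance: on these non-PAL RUN lines there is no GIT pointer to dodge, so the shifted-down RSRC lands in the first free aligned quad, s[0:3], instead of being pushed past the scratch wave offset to s[4:7]. A toy version of the quad scan under that assumption (hypothetical name; the real loop's usage and allocatability checks are elided, and ~0u marks "no GIT pointer"):

```cpp
#include <cassert>

// Pick the first 4-aligned SGPR quad that does not contain the GIT
// pointer; the real loop additionally requires the quad to be unused
// and allocatable.
unsigned pickRsrcQuad(unsigned GITPtrLo, unsigned MaxSGPRs) {
  for (unsigned First = 0; First + 4 <= MaxSGPRs; First += 4)
    if (GITPtrLo < First || GITPtrLo >= First + 4)
      return First;
  return ~0u; // nothing found: keep the default reserved quad
}

int main() {
  // No GIT pointer (non-PAL): the quad lands at s0, so the test's
  // expectations move from s4..s7 down to s0..s3.
  assert(pickRsrcQuad(~0u, 104) == 0);
  // GIT pointer in s0 (PAL, non-merged shader): s[0:3] is rejected and
  // the quad shifts to s4, mirroring the isSubRegisterEq check above.
  assert(pickRsrcQuad(0, 104) == 4);
  return 0;
}
```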