For PAL, make sure Scratch Buffer Descriptor does not clobber GIT pointer
Since SRSRC has alignment requirements, first find registers for SRSRC that do
not clobber the GIT pointer; then, if those registers clobber the preloaded
Scratch Wave Offset register, copy the Scratch Wave Offset register to a free SGPR.
RamNalamothu authored and arsenm committed May 6, 2020
1 parent 2f1fe18 commit f7060f4
Showing 6 changed files with 142 additions and 63 deletions.
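In outline, the change works as follows (a condensed sketch distilled from the
diff below, not the verbatim committed code; pickAligned4TupleAvoiding and
pickFreeSGPRAvoiding are hypothetical helpers standing in for the real
selection loops over getAllSGPR128/getAllSGPRs):

  // The SRSRC is chosen first because it needs an aligned tuple of four
  // SGPRs; on PAL it must also avoid the GIT pointer (SGPR0, or SGPR8 for
  // merged HS/GS shaders on gfx9+).
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  Register ScratchRsrcReg = pickAligned4TupleAvoiding(GITPtrLoReg); // hypothetical

  // Only then is the preloaded scratch wave offset checked: if the chosen
  // SRSRC tuple overlaps it, the offset is copied out to a free SGPR that
  // avoids both the SRSRC and the GIT pointer.
  Register ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  if (TRI->isSubRegisterEq(ScratchRsrcReg, ScratchWaveOffsetReg)) {
    Register FreeSGPR = pickFreeSGPRAvoiding(ScratchRsrcReg, GITPtrLoReg); // hypothetical
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), FreeSGPR)
        .addReg(ScratchWaveOffsetReg, RegState::Kill);
    ScratchWaveOffsetReg = FreeSGPR;
  }
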
106 changes: 58 additions & 48 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -30,6 +30,11 @@ static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                       ST.getMaxNumSGPRs(MF) / 4);
 }
 
+static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
+                                       const MachineFunction &MF) {
+  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
+}
+
 // Find a scratch register that we can use at the start of the prologue to
 // re-align the stack pointer. We avoid using callee-save registers since they
 // may appear to be free when this is called from canUseAsPrologue (during
@@ -257,7 +262,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
 // Shift down registers reserved for the scratch RSRC.
 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
-    MachineFunction &MF, Register ScratchWaveOffsetReg) const {
+    MachineFunction &MF) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -269,9 +274,8 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
 
   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
 
-  if (ScratchRsrcReg == AMDGPU::NoRegister ||
-      !MRI.isPhysRegUsed(ScratchRsrcReg))
-    return AMDGPU::NoRegister;
+  if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
+    return Register();
 
   if (ST.hasSGPRInitBug() ||
       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
@@ -292,17 +296,13 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
 
   // Skip the last N reserved elements because they should have already been
   // reserved for VCC etc.
+  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
   for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
-    // reserved input we needed.
-    //
-    // FIXME: The preloaded SGPR count is not accurate for shaders as the
-    // scratch wave offset may be in a fixed SGPR or
-    // SITargetLowering::allocateSystemSGPRs may choose some free SGPR for the
-    // scratch wave offset. We explicitly avoid the scratch wave offset to
-    // account for this.
+    // reserved input we needed. Also for PAL, make sure we don't clobber
+    // the GIT pointer passed in SGPR0 or SGPR8.
     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
-        !TRI->isSubRegisterEq(Reg, ScratchWaveOffsetReg)) {
+        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -330,28 +330,28 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
 
   assert(MFI->isEntryFunction());
 
-  Register ScratchWaveOffsetReg = MFI->getPreloadedReg(
+  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
   // FIXME: Hack to not crash in situations which emitted an error.
-  if (ScratchWaveOffsetReg == AMDGPU::NoRegister)
+  if (!PreloadedScratchWaveOffsetReg)
     return;
 
   // We need to do the replacement of the private segment buffer register even
   // if there are no stack objects. There could be stores to undef or a
   // constant without an associated object.
   //
-  // This will return `AMDGPU::NoRegister` in cases where there are no actual
+  // This will return `Register()` in cases where there are no actual
   // uses of the SRSRC.
-  Register ScratchRsrcReg =
-      getEntryFunctionReservedScratchRsrcReg(MF, ScratchWaveOffsetReg);
+  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
 
   // Make the selected register live throughout the function.
-  if (ScratchRsrcReg != AMDGPU::NoRegister) {
+  if (ScratchRsrcReg) {
     for (MachineBasicBlock &OtherBB : MF) {
       if (&OtherBB != &MBB) {
         OtherBB.addLiveIn(ScratchRsrcReg);
@@ -361,12 +361,11 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
 
   // Now that we have fixed the reserved SRSRC we need to locate the
   // (potentially) preloaded SRSRC.
-  Register PreloadedScratchRsrcReg = AMDGPU::NoRegister;
+  Register PreloadedScratchRsrcReg;
   if (ST.isAmdHsaOrMesa(F)) {
     PreloadedScratchRsrcReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-    if (ScratchRsrcReg != AMDGPU::NoRegister &&
-        PreloadedScratchRsrcReg != AMDGPU::NoRegister) {
+    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
       // We added live-ins during argument lowering, but since they were not
       // used they were deleted. We're adding the uses now, so add them back.
       MRI.addLiveIn(PreloadedScratchRsrcReg);
@@ -379,6 +378,32 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
+  // We found the SRSRC first because it needs four registers and has an
+  // alignment requirement. If the SRSRC that we found is clobbering with
+  // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
+  // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
+  // wave offset to a free SGPR.
+  Register ScratchWaveOffsetReg;
+  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
+    ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
+    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+    AllSGPRs = AllSGPRs.slice(
+        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
+    for (MCPhysReg Reg : AllSGPRs) {
+      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
+        ScratchWaveOffsetReg = Reg;
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
+            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+        break;
+      }
+    }
+  } else {
+    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
+  }
+  assert(ScratchWaveOffsetReg);
+
   if (MF.getFrameInfo().hasCalls()) {
     Register SPReg = MFI->getStackPtrOffsetReg();
     assert(SPReg != AMDGPU::SP_REG);
@@ -392,16 +417,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
   }
 
-  if (MFI->hasFlatScratchInit() || ScratchRsrcReg != AMDGPU::NoRegister) {
-    MRI.addLiveIn(ScratchWaveOffsetReg);
-    MBB.addLiveIn(ScratchWaveOffsetReg);
+  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
+    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
   }
 
   if (MFI->hasFlatScratchInit()) {
     emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
   }
 
-  if (ScratchRsrcReg != AMDGPU::NoRegister) {
+  if (ScratchRsrcReg) {
     emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                          PreloadedScratchRsrcReg,
                                          ScratchRsrcReg, ScratchWaveOffsetReg);
@@ -437,19 +462,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
       const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
       BuildMI(MBB, I, DL, GetPC64, Rsrc01);
     }
-    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
-    if (ST.hasMergedShaders()) {
-      switch (MF.getFunction().getCallingConv()) {
-      case CallingConv::AMDGPU_HS:
-      case CallingConv::AMDGPU_GS:
-        // Low GIT address is passed in s8 rather than s0 for an LS+HS or
-        // ES+GS merged shader on gfx9+.
-        GitPtrLo = AMDGPU::SGPR8;
-        break;
-      default:
-        break;
-      }
-    }
+    Register GitPtrLo = MFI->getGITPtrLoReg(MF);
     MF.getRegInfo().addLiveIn(GitPtrLo);
     MBB.addLiveIn(GitPtrLo);
     BuildMI(MBB, I, DL, SMovB32, RsrcLo)
@@ -475,8 +488,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
         .addImm(0) // dlc
         .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
         .addMemOperand(MMO);
-  } else if (ST.isMesaGfxShader(Fn) ||
-             (PreloadedScratchRsrcReg == AMDGPU::NoRegister)) {
+  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
     assert(!ST.isAmdHsaOrMesa(Fn));
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
@@ -537,7 +549,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
       .addImm(Rsrc23 >> 32)
       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
   } else if (ST.isAmdHsaOrMesa(Fn)) {
-    assert(PreloadedScratchRsrcReg != AMDGPU::NoRegister);
+    assert(PreloadedScratchRsrcReg);
 
     if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
       BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
@@ -650,7 +662,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
                      Reg.FI.getValue());
   }
 
-  if (ScratchExecCopy != AMDGPU::NoRegister) {
+  if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -659,7 +671,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
     LiveRegs.addReg(ScratchExecCopy);
   }
 
-
   if (FuncInfo->FramePointerSaveIndex) {
     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
     assert(!MFI.isDeadObjectIndex(FI) &&
@@ -690,8 +701,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
     Register ScratchSPReg = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-    assert(ScratchSPReg != AMDGPU::NoRegister &&
-           ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
+    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
 
     // s_add_u32 tmp_reg, s32, NumBytes
     // s_and_b32 s32, tmp_reg, 0b111...0000
@@ -785,7 +795,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
       continue;
 
     const SIRegisterInfo &TRI = TII->getRegisterInfo();
-    if (ScratchExecCopy == AMDGPU::NoRegister) {
+    if (!ScratchExecCopy) {
       // See emitPrologue
       if (LiveRegs.empty()) {
        LiveRegs.init(*ST.getRegisterInfo());
@@ -809,7 +819,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                       FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
   }
 
-  if (ScratchExecCopy != AMDGPU::NoRegister) {
+  if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -991,7 +1001,7 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
 
   for (auto &CS : CSI) {
     if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
-      if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
+      if (FuncInfo->SGPRForFPSaveRestoreCopy)
         CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
       break;
     }
4 changes: 1 addition & 3 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -61,9 +61,7 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
                                         const DebugLoc &DL,
                                         Register ScratchWaveOffsetReg) const;
 
-  Register
-  getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF,
-                                         Register ScratchWaveOffsetReg) const;
+  Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
 
   void emitEntryFunctionScratchRsrcRegSetup(
       MachineFunction &MF, MachineBasicBlock &MBB,
21 changes: 21 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -439,6 +439,27 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
   return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
 }
 
+Register
+SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.isAmdPalOS())
+    return Register();
+  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
+  if (ST.hasMergedShaders()) {
+    switch (MF.getFunction().getCallingConv()) {
+    case CallingConv::AMDGPU_HS:
+    case CallingConv::AMDGPU_GS:
+      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
+      // ES+GS merged shader on gfx9+.
+      GitPtrLo = AMDGPU::SGPR8;
+      return GitPtrLo;
+    default:
+      return GitPtrLo;
+    }
+  }
+  return GitPtrLo;
+}
+
 static yaml::StringValue regToString(Register Reg,
                                      const TargetRegisterInfo &TRI) {
   yaml::StringValue Dest;
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -676,6 +676,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
     return GITPtrHigh;
   }
 
+  Register getGITPtrLoReg(const MachineFunction &MF) const;
+
   uint32_t get32BitAddressHighBits() const {
     return HighBitsOf32BitAddress;
   }
48 changes: 48 additions & 0 deletions llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
@@ -0,0 +1,48 @@
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -run-pass=prologepilog -o - %s | FileCheck %s
+
+# On PAL, we need to ensure SRSRC do not clobber GIT pointer, passed
+# in SGPR8 for HS or GS
+
+--- |
+
+  define amdgpu_gs void @shader(i32 inreg %mergedGroupInfo) {
+    ret void
+  }
+...
+---
+name: shader
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr100_sgpr101_sgpr102_sgpr103'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
+body: |
+  ; CHECK: $sgpr1 = COPY killed $sgpr5
+  ; CHECK: $sgpr4_sgpr5 = S_GETPC_B64
+  ; CHECK: $sgpr4 = S_MOV_B32 $sgpr8, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+  ; CHECK: $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM $sgpr4_sgpr5, 0, 0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 :: (dereferenceable invariant load 16, align 4, addrspace 4)
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0
+
+    $exec_lo = S_MOV_B32 -1
+    renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
+    renamable $sgpr0 = S_BFE_U32 killed renamable $sgpr0, 589836, implicit-def dead $scc
+    renamable $vcc_lo = V_CMP_GT_U32_e64 killed $sgpr0, killed $vgpr0, implicit $exec
+    $vcc_hi = IMPLICIT_DEF
+    $sgpr0 = S_AND_SAVEEXEC_B32 $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, undef renamable $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+
+  bb.2:
+    S_ENDPGM 0
+...
