Skip to content

Commit

Permalink
[AMDGPU] Callee must always spill writelane VGPRs
Browse files Browse the repository at this point in the history
Since the writelane instruction used for SGPR spills can
modify inactive lanes, the callee must preserve the VGPR
that this instruction modifies, even if that register is
marked caller-saved.

Reviewed By: arsenm, nhaehnle

Differential Revision: https://reviews.llvm.org/D124192
  • Loading branch information
cdevadas committed Dec 17, 2022
1 parent cc037e1 commit 5692a7e
Show file tree
Hide file tree
Showing 17 changed files with 455 additions and 387 deletions.
57 changes: 40 additions & 17 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Expand Up @@ -98,10 +98,12 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
// 3: There's no free lane to spill, and no free register to save FP/BP,
// so we're forced to spill another VGPR to use for the spill.
auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
MFI->allocateWWMSpill(MF, Spill.VGPR);

FrameIndex = NewFI;

LLVM_DEBUG(
auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
} else {
Expand Down Expand Up @@ -777,17 +779,14 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
std::optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;

// VGPRs used for SGPR->VGPR spills
for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI)
continue;

for (const auto &Reg : FuncInfo->getWWMSpills()) {
Register VGPR = Reg.first;
int FI = Reg.second;
if (!ScratchExecCopy)
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
/*IsProlog*/ true);

buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, Reg.VGPR,
*Reg.FI);
buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR, FI);
}

for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
Expand Down Expand Up @@ -1054,17 +1053,15 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
}

Register ScratchExecCopy;
for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI)
continue;

for (const auto &Reg : FuncInfo->getWWMSpills()) {
Register VGPR = Reg.first;
int FI = Reg.second;
if (!ScratchExecCopy)
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, /*IsProlog*/ false);

buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
Reg.VGPR, *Reg.FI);
buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR,
FI);
}

for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
Expand Down Expand Up @@ -1262,6 +1259,32 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();

for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
// WRITELANE instructions used for SGPR spills can overwrite the inactive
// lanes of VGPRs and callee must spill and restore them even if they are
// marked Caller-saved.

// TODO: Handle this elsewhere at an early point. Walking through all MBBs
// here would be a bad heuristic. A better way should be by calling
// allocateWWMSpill during the regalloc pipeline whenever a physical
// register is allocated for the intended virtual registers. That will
// also help excluding the general use of WRITELANE/READLANE intrinsics
// that won't really need any such special handling.
if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
}
}

for (MachineBasicBlock &MBB : MF) {
for (auto &Reg : MFI->getWWMSpills())
MBB.addLiveIn(Reg.first);

MBB.sortUniqueLiveIns();
}

// Ignore the SGPRs the default implementation found.
SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

Expand All @@ -1285,8 +1308,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,

// VGPRs used for SGPR spilling need to be specially inserted in the prolog,
// so don't allow the default insertion to handle them.
for (auto SSpill : MFI->getSGPRSpillVGPRs())
SavedVGPRs.reset(SSpill.VGPR);
for (auto &Reg : MFI->getWWMSpills())
SavedVGPRs.reset(Reg.first);

LivePhysRegs LiveRegs;
LiveRegs.init(*TRI);
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
Expand Up @@ -309,8 +309,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {

// FIXME: Adding to live-ins redundant with reserving registers.
for (MachineBasicBlock &MBB : MF) {
for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(SSpill.VGPR);
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(Reg);
MBB.sortUniqueLiveIns();

// FIXME: The dead frame indices are replaced with a null register from
Expand Down
21 changes: 12 additions & 9 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Expand Up @@ -270,6 +270,16 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
return ArgInfo.LDSKernelId.getRegister();
}

void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Entry functions have no callers whose inactive lanes need preserving, so
  // no prolog/epilog spill slot is required for them.
  if (isEntryFunction())
    return;

  // A slot has already been reserved for this register; nothing to do.
  if (WWMSpills.count(VGPR))
    return;

  // Reserve a fresh spill stack slot and remember the (VGPR, frame index)
  // pairing so the prolog/epilog insertion can save/restore it.
  int FrameIdx = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
  WWMSpills.insert(std::make_pair(VGPR, FrameIdx));
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
MCPhysReg Reg) {
for (unsigned I = 0; CSRegs[I]; ++I) {
Expand Down Expand Up @@ -340,21 +350,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return false;
}

std::optional<int> SpillFI;
// We need to preserve inactive lanes, so always save, even caller-save
// registers.
if (!isEntryFunction()) {
SpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
}

SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI));
SpillVGPRs.push_back(LaneVGPR);

// Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
for (MachineBasicBlock &BB : MF)
BB.addLiveIn(LaneVGPR);
} else {
LaneVGPR = SpillVGPRs.back().VGPR;
LaneVGPR = SpillVGPRs.back();
}

SpillLanes.push_back(SIRegisterInfo::SpilledReg(LaneVGPR, VGPRIndex));
Expand Down
27 changes: 14 additions & 13 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
Expand Up @@ -431,17 +431,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
MCPhysReg getNextSystemSGPR() const;

public:
struct SGPRSpillVGPR {
// VGPR used for SGPR spills
Register VGPR;

// If the VGPR is used for SGPR spills in a non-entrypoint function, the
// stack slot used to save/restore it in the prolog/epilog.
std::optional<int> FI;

SGPRSpillVGPR(Register V, std::optional<int> F) : VGPR(V), FI(F) {}
};

struct VGPRSpillToAGPR {
SmallVector<MCPhysReg, 32> Lanes;
bool FullyAllocated = false;
Expand Down Expand Up @@ -471,7 +460,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// frameindex key.
DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRToVGPRSpills;
unsigned NumVGPRSpillLanes = 0;
SmallVector<SGPRSpillVGPR, 2> SpillVGPRs;
SmallVector<Register, 2> SpillVGPRs;
using WWMSpillsMap = MapVector<Register, int>;
// To track the registers used in instructions that can potentially modify the
// inactive lanes. The WWM instructions and the writelane instructions for
// spilling SGPRs to VGPRs fall under such category of operations. The VGPRs
// modified by them should be spilled/restored at function prolog/epilog to
// avoid any undesired outcome. Each entry in this map holds a pair of values,
// the VGPR and its stack slot index.
WWMSpillsMap WWMSpills;

DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;

Expand Down Expand Up @@ -540,7 +537,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
: makeArrayRef(I->second);
}

ArrayRef<SGPRSpillVGPR> getSGPRSpillVGPRs() const { return SpillVGPRs; }
ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; }
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }

void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4,
Align Alignment = Align(4));

ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const {
return SpillAGPR;
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Expand Up @@ -711,8 +711,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
reserveRegisterTuples(Reserved, Reg);

for (auto SSpill : MFI->getSGPRSpillVGPRs())
reserveRegisterTuples(Reserved, SSpill.VGPR);
for (auto Reg : MFI->getSGPRSpillVGPRs())
reserveRegisterTuples(Reserved, Reg);

return Reserved;
}
Expand Down
48 changes: 24 additions & 24 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
Expand Up @@ -8,7 +8,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: s_or_saveexec_b32 s4, -1
; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: v_mov_b32_e32 v15, v1
; CHECK-NEXT: v_mov_b32_e32 v14, v2
Expand All @@ -25,14 +25,14 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: v_mov_b32_e32 v5, v11
; CHECK-NEXT: v_mov_b32_e32 v6, v10
; CHECK-NEXT: v_mov_b32_e32 v7, v9
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s4, s8
; CHECK-NEXT: s_mov_b32 s5, s8
Expand All @@ -47,19 +47,19 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_mov_b32 s5, s6
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: v_writelane_b32 v8, s4, 4
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v7, v9
; CHECK-NEXT: v_mov_b32_e32 v6, v10
Expand Down Expand Up @@ -115,8 +115,8 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_and_saveexec_b32 s4, s4
; CHECK-NEXT: v_writelane_b32 v8, s4, 13
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s4, v8, 13
; CHECK-NEXT: v_readlane_b32 s8, v8, 5
; CHECK-NEXT: v_readlane_b32 s9, v8, 6
Expand All @@ -133,20 +133,20 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_readlane_b32 s4, v8, 4
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: ; %bb.4:
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mov_b32_e32 v2, s4
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: s_or_saveexec_b32 s4, -1
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
Expand Down

0 comments on commit 5692a7e

Please sign in to comment.