Skip to content

Commit

Permalink
[AMDGPU] Pick available high VGPR for CSR SGPR spilling (#78669)
Browse files Browse the repository at this point in the history
CSR SGPR spilling currently uses the first available (lowest-numbered)
physical VGPRs. This imposes high register pressure when trying to allocate
large VGPR tuples within the default register budget.

This patch changes the spilling strategy by picking the VGPRs in reverse
order, the highest available VGPR first, and later, after regalloc,
shifting them back to the lowest available range. With that, the initial
VGPRs remain available for allocation, and the chance of finding a large
number of contiguous registers is higher.
  • Loading branch information
cdevadas committed Jan 24, 2024
1 parent 7e50f00 commit 230c13d
Show file tree
Hide file tree
Showing 31 changed files with 4,513 additions and 4,231 deletions.
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ static void getVGPRSpillLaneOrTempRegister(
TargetStackID::SGPRSpill);

if (TRI->spillSGPRToVGPR() &&
MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
/*IsPrologEpilog=*/true)) {
// 2: There's no free lane to spill, and no free register to save the
// SGPR, so we're forced to take another VGPR to use for the spill.
MFI->addToPrologEpilogSGPRSpills(
Expand Down Expand Up @@ -1560,6 +1561,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
return;

MFI->shiftSpillPhysVGPRsToLowestRange(MF);

TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
if (MFI->isEntryFunction())
return;
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// regalloc aware CFI generation to insert new CFIs along with the
// intermediate spills is implemented. No such support currently exists
// in the LLVM compiler.
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
if (FuncInfo->allocateSGPRSpillToVGPRLane(
MF, FI, /*SpillToPhysVGPRLane=*/true)) {
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
MI, FI, nullptr, Indexes, LIS, true);
if (!Spilled)
Expand Down
54 changes: 43 additions & 11 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,33 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
return false;
}

// Moves the physical VGPRs recorded in SpillPhysVGPRs onto lower-numbered
// unused VGPRs, so the registers picked for CSR SGPR spilling end up in the
// lowest available range.
//
// Entries are visited in order. For each one, an unused VGPR_32 is requested
// from the register info; the walk stops at the first entry for which no
// register is returned or the returned register is not strictly lower than
// the current one. For every entry that does move, all uses are rewritten
// and the bookkeeping tables (SpillPhysVGPRs, WWMReservedRegs, WWMSpills) and
// the basic-block live-in lists are updated to drop the old register.
void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange(
    MachineFunction &MF) {
  const SIRegisterInfo *SIRI =
      MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &RegInfo = MF.getRegInfo();

  const unsigned NumSpillVGPRs = SpillPhysVGPRs.size();
  for (unsigned Idx = 0; Idx < NumSpillVGPRs; ++Idx) {
    Register OldVGPR = SpillPhysVGPRs[Idx];
    Register Candidate =
        SIRI->findUnusedRegister(RegInfo, &AMDGPU::VGPR_32RegClass, MF);

    // Nothing free at all: no further shifting is possible.
    if (!Candidate)
      break;
    // The free register is not below the current one, so moving would not
    // bring the spill VGPR any lower.
    if (Candidate >= OldVGPR)
      break;

    // Rewrite every reference to the old register in the function.
    RegInfo.replaceRegWith(OldVGPR, Candidate);

    // Keep the spill-register list and the WWM reserved set in sync.
    SpillPhysVGPRs[Idx] = Candidate;
    WWMReservedRegs.remove(OldVGPR);
    WWMReservedRegs.insert(Candidate);

    // Carry the WWM spill entry of the old register over to the new one.
    auto SpillEntry = WWMSpills[OldVGPR];
    WWMSpills.insert(std::make_pair(Candidate, SpillEntry));
    WWMSpills.erase(OldVGPR);

    // The old register is no longer live into any block.
    for (MachineBasicBlock &Block : MF) {
      Block.removeLiveIn(OldVGPR);
      Block.sortUniqueLiveIns();
    }
  }
}

bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
MachineFunction &MF, int FI, unsigned LaneIndex) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Expand All @@ -329,13 +356,17 @@ bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
}

bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
MachineFunction &MF, int FI, unsigned LaneIndex) {
MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
Register LaneVGPR;
if (!LaneIndex) {
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
// Find the highest available register if called before RA to ensure the
// lowest registers are available for allocation. The LaneVGPR, in that
// case, will be shifted back to the lowest range after VGPR allocation.
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
!IsPrologEpilog);
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
// partially spill the SGPR to VGPRs.
Expand All @@ -359,12 +390,12 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
return true;
}

bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
int FI,
bool IsPrologEpilog) {
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
bool IsPrologEpilog) {
std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
: SGPRSpillsToVirtualVGPRLanes[FI];
SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
: SGPRSpillsToVirtualVGPRLanes[FI];

// This has already been allocated.
if (!SpillLanes.empty())
Expand All @@ -384,14 +415,15 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
"not spilling SGPRs to VGPRs");

unsigned &NumSpillLanes =
IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
: NumVirtualVGPRSpillLanes;

for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
unsigned LaneIndex = (NumSpillLanes % WaveSize);

bool Allocated = IsPrologEpilog
? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
bool Allocated = SpillToPhysVGPRLane
? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
IsPrologEpilog)
: allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
if (!Allocated) {
NumSpillLanes -= I;
Expand Down
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
unsigned LaneIndex);
bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI,
unsigned LaneIndex);
unsigned LaneIndex,
bool IsPrologEpilog);

public:
Register getVGPRForAGPRCopy() const {
Expand Down Expand Up @@ -588,6 +589,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
}

ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; }

const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }

Expand Down Expand Up @@ -702,7 +704,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
I->second.IsDead = true;
}

// To bring the Physical VGPRs in the highest range allocated for CSR SGPR
// spilling into the lowest available range.
void shiftSpillPhysVGPRsToLowestRange(MachineFunction &MF);

bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI,
bool SpillToPhysVGPRLane = false,
bool IsPrologEpilog = false);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);

Expand Down

0 comments on commit 230c13d

Please sign in to comment.