826 changes: 494 additions & 332 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -9,6 +9,18 @@
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
///
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
@@ -20,9 +32,9 @@
using namespace llvm;

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C) :
GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
HasExcessPressure(false), MF(nullptr) { }
const MachineSchedContext *C)
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
HasClusteredNodes(false), HasExcessPressure(false) {}

void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -302,210 +314,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
return SU;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S) :
ScheduleDAGMILive(C, std::move(S)),
ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()),
MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {

LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}

void GCNScheduleDAGMILive::schedule() {
if (Stage == Collect) {
// Just record regions at the first pass.
Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
return;
}

std::vector<MachineInstr*> Unsched;
Unsched.reserve(NumRegionInstrs);
for (auto &I : *this) {
Unsched.push_back(&I);
}

GCNRegPressure PressureBefore;
if (LIS) {
PressureBefore = Pressure[RegionIdx];

LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
dbgs() << "Region live-in pressure: ";
llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
dbgs() << "Region register pressure: ";
PressureBefore.print(dbgs()));
}

GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
// Set HasClusteredNodes to true for late stages where we have already
// collected it. That way pickNode() will not scan SDep's when not needed.
S.HasClusteredNodes = Stage > InitialSchedule;
S.HasExcessPressure = false;
ScheduleDAGMILive::schedule();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
RescheduleRegions[RegionIdx] = false;
if (Stage == InitialSchedule && S.HasClusteredNodes)
RegionsWithClusters[RegionIdx] = true;
if (S.HasExcessPressure)
RegionsWithHighRP[RegionIdx] = true;

if (!LIS)
return;

// Check the results of scheduling.
auto PressureAfter = getRealRegPressure();

LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
PressureAfter.print(dbgs()));

if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
Pressure[RegionIdx] = PressureAfter;
RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == MinOccupancy;

LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}

unsigned WavesAfter =
std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST));
unsigned WavesBefore =
std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n");

// We may not be able to keep the current target occupancy because of the just
// scheduled region. We might still be able to revert scheduling if the
// occupancy before was higher, or if the current schedule has register
// pressure higher than the excess limits which could lead to more spilling.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

// Allow memory bound functions to drop to 4 waves if not limited by an
// attribute.
if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
WavesAfter >= MFI.getMinAllowedOccupancy()) {
LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
<< MFI.getMinAllowedOccupancy() << " waves\n");
NewOccupancy = WavesAfter;
}

if (NewOccupancy < MinOccupancy) {
MinOccupancy = NewOccupancy;
MFI.limitOccupancy(MinOccupancy);
RegionsWithMinOcc.reset();
LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
<< MinOccupancy << ".\n");
}

unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
PressureAfter.getAGPRNum() > MaxVGPRs ||
PressureAfter.getSGPRNum() > MaxSGPRs) {
RescheduleRegions[RegionIdx] = true;
RegionsWithHighRP[RegionIdx] = true;
}

// If this condition is true, then either the occupancy before and after
// scheduling is the same, or we are allowing the occupancy to drop because
// the function is memory bound. Even if we are OK with the current occupancy,
// we still need to verify that we will not introduce any extra chance of
// spilling.
if (WavesAfter >= MinOccupancy) {
if (Stage == UnclusteredReschedule &&
!PressureAfter.less(ST, PressureBefore)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
} else if (WavesAfter > MFI.getMinWavesPerEU() ||
PressureAfter.less(ST, PressureBefore) ||
!RescheduleRegions[RegionIdx]) {
Pressure[RegionIdx] = PressureAfter;
RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == MinOccupancy;
if (!RegionsWithClusters[RegionIdx] &&
(Stage + 1) == UnclusteredReschedule)
RescheduleRegions[RegionIdx] = false;
return;
} else {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}
}

RegionsWithMinOcc[RegionIdx] =
PressureBefore.getOccupancy(ST) == MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
(Stage + 1) != UnclusteredReschedule;
RegionEnd = RegionBegin;
int SkippedDebugInstr = 0;
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr()) {
++SkippedDebugInstr;
continue;
}

if (MI->getIterator() != RegionEnd) {
BB->remove(MI);
BB->insert(RegionEnd, MI);
if (!MI->isDebugInstr())
LIS->handleMove(*MI, true);
}
// Reset read-undef flags and update them later.
for (auto &Op : MI->operands())
if (Op.isReg() && Op.isDef())
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
if (!MI->isDebugInstr()) {
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
} else {
// Adjust for missing dead-def flags.
RegOpers.detectDeadDefs(*MI, *LIS);
}
}
RegionEnd = MI->getIterator();
++RegionEnd;
LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
}

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past debug instrs to the actual end of the scheduling region.
while (SkippedDebugInstr-- > 0)
++RegionEnd;

// If Unsched.front() instruction is a debug instruction, this will actually
// shrink the region since we moved all debug instructions to the end of the
// block. Find the first instruction that is not a debug instruction.
RegionBegin = Unsched.front()->getIterator();
if (RegionBegin->isDebugInstr()) {
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())
continue;
RegionBegin = MI->getIterator();
break;
}
}

// Then move the debug instructions back into their correct place and set
// RegionBegin and RegionEnd if needed.
placeDebugValues();

Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
// Collect all scheduling regions. The actual scheduling is performed in
// GCNScheduleDAGMILive::finalizeSchedule.
Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
}

GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
GCNDownwardRPTracker RPTracker(*LIS);
RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
return RPTracker.moveMaxPressure();
}

void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
const MachineBasicBlock *MBB) {
GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor, then live-ins of that successor are
@@ -542,7 +374,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
RPTracker.reset(*I, &LRS);
}

for ( ; ; ) {
for (;;) {
I = RPTracker.getNext();

if (Regions[CurRegion].first == I || NonDbgMI == I) {
@@ -588,8 +420,9 @@
}

void GCNScheduleDAGMILive::finalizeSchedule() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

// Start actual scheduling here. This function is called by the base
// MachineScheduler after all regions have been recorded by
// GCNScheduleDAGMILive::schedule().
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
RescheduleRegions.resize(Regions.size());
@@ -601,153 +434,481 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
RegionsWithHighRP.reset();
RegionsWithMinOcc.reset();

runSchedStages();
}

void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this);
ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
*this);
PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3};

if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();

std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
for (auto *Stage : SchedStages) {
if (!Stage->initGCNSchedStage())
continue;

do {
Stage++;
RegionIdx = 0;
MachineBasicBlock *MBB = nullptr;
for (auto Region : Regions) {
RegionBegin = Region.first;
RegionEnd = Region.second;
// Setup for scheduling the region and check whether it should be skipped.
if (!Stage->initGCNRegion()) {
Stage->advanceRegion();
exitRegion();
continue;
}

if (Stage > InitialSchedule) {
if (!LIS)
break;
ScheduleDAGMILive::schedule();
Stage->finalizeGCNRegion();
}

  // Retry function scheduling if the resulting occupancy is lower than the
  // one used for first-pass scheduling. This will give more freedom to
  // schedule low register pressure blocks.
  // Code is partially copied from MachineSchedulerBase::scheduleRegions().
Stage->finalizeGCNSchedStage();
}
}

if (Stage == UnclusteredReschedule) {
if (RescheduleRegions.none())
continue;
LLVM_DEBUG(dbgs() <<
"Retrying function scheduling without clustering.\n");
}
#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
switch (StageID) {
case GCNSchedStageID::InitialSchedule:
OS << "Initial Schedule";
break;
case GCNSchedStageID::UnclusteredReschedule:
OS << "Unclustered Reschedule";
break;
case GCNSchedStageID::ClusteredLowOccupancyReschedule:
OS << "Clustered Low Occupancy Reschedule";
break;
case GCNSchedStageID::PreRARematerialize:
OS << "Pre-RA Rematerialize";
break;
}
return OS;
}
#endif

if (Stage == ClusteredLowOccupancyReschedule) {
if (StartingOccupancy <= MinOccupancy)
break;
GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)),
MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}

LLVM_DEBUG(
dbgs()
<< "Retrying function scheduling with lowest recorded occupancy "
<< MinOccupancy << ".\n");
}
bool GCNSchedStage::initGCNSchedStage() {
if (!DAG.LIS)
return false;

if (Stage == PreRARematerialize) {
if (RegionsWithMinOcc.none() || Regions.size() == 1)
break;

const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
      // Check if we are already at maximum occupancy.
if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
MinOccupancy)
break;

      // FIXME: This pass will invalidate cached MBBLiveIns for regions
      // in between the defs and the region we sunk the def to. Cached pressure
      // for regions where a def is sunk from will also be invalidated. This
      // will need to be fixed if there is another pass after this pass.
static_assert(LastStage == PreRARematerialize,
"Passes after PreRARematerialize are not supported");

collectRematerializableInstructions();
if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
break;

LLVM_DEBUG(
dbgs() << "Retrying function scheduling with improved occupancy of "
<< MinOccupancy << " from rematerializing\n");
}
}
LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
return true;
}

if (Stage == UnclusteredReschedule)
SavedMutations.swap(Mutations);
bool UnclusteredRescheduleStage::initGCNSchedStage() {
if (!GCNSchedStage::initGCNSchedStage())
return false;

for (auto Region : Regions) {
if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
!RescheduleRegions[RegionIdx]) ||
(Stage == ClusteredLowOccupancyReschedule &&
!RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
if (DAG.RescheduleRegions.none())
return false;

++RegionIdx;
continue;
}
SavedMutations.swap(DAG.Mutations);

RegionBegin = Region.first;
RegionEnd = Region.second;
LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n");
return true;
}

if (RegionBegin->getParent() != MBB) {
if (MBB) finishBlock();
MBB = RegionBegin->getParent();
startBlock(MBB);
if (Stage == InitialSchedule)
computeBlockPressure(MBB);
}
bool ClusteredLowOccStage::initGCNSchedStage() {
if (!GCNSchedStage::initGCNSchedStage())
return false;

unsigned NumRegionInstrs = std::distance(begin(), end());
enterRegion(MBB, begin(), end(), NumRegionInstrs);
// Don't bother trying to improve ILP in lower RP regions if occupancy has not
// been dropped. All regions will have already been scheduled with the ideal
// occupancy targets.
if (DAG.StartingOccupancy <= DAG.MinOccupancy)
return false;

// Skip empty scheduling regions (0 or 1 schedulable instructions).
if (begin() == end() || begin() == std::prev(end())) {
exitRegion();
++RegionIdx;
continue;
}
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with lowest recorded occupancy "
<< DAG.MinOccupancy << ".\n");
return true;
}

LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
<< MBB->getName() << "\n From: " << *begin()
<< " To: ";
if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
else dbgs() << "End";
dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
bool PreRARematStage::initGCNSchedStage() {
if (!GCNSchedStage::initGCNSchedStage())
return false;

if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
return false;

schedule();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  // Check if we are already at maximum occupancy.
if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
DAG.MinOccupancy)
return false;

  // FIXME: This pass will invalidate cached MBBLiveIns for regions
  // in between the defs and the region we sunk the def to. Cached pressure
  // for regions where a def is sunk from will also be invalidated. This will
  // need to be fixed if there is another pass after this pass.

collectRematerializableInstructions();
if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
return false;

exitRegion();
++RegionIdx;
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with improved occupancy of "
<< DAG.MinOccupancy << " from rematerializing\n");
return true;
}

void GCNSchedStage::finalizeGCNSchedStage() {
DAG.finishBlock();
LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}

void UnclusteredRescheduleStage::finalizeGCNSchedStage() {
SavedMutations.swap(DAG.Mutations);

GCNSchedStage::finalizeGCNSchedStage();
}

bool GCNSchedStage::initGCNRegion() {
// Check whether this new region is also a new block.
if (DAG.RegionBegin->getParent() != CurrentMBB)
setupNewBlock();

unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

// Skip empty scheduling regions (0 or 1 schedulable instructions).
if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
return false;

LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
<< " " << CurrentMBB->getName()
<< "\n From: " << *DAG.begin() << " To: ";
if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
else dbgs() << "End";
dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

// Save original instruction order before scheduling for possible revert.
Unsched.clear();
Unsched.reserve(DAG.NumRegionInstrs);
for (auto &I : DAG)
Unsched.push_back(&I);

PressureBefore = DAG.Pressure[RegionIdx];

LLVM_DEBUG(
dbgs() << "Pressure before scheduling:\nRegion live-ins:";
GCNRPTracker::printLiveRegs(dbgs(), DAG.LiveIns[RegionIdx], DAG.MRI);
dbgs() << "Region live-in pressure: ";
llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs());
dbgs() << "Region register pressure: "; PressureBefore.print(dbgs()));

// Set HasClusteredNodes to true for late stages where we have already
// collected it. That way pickNode() will not scan SDep's when not needed.
S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule;
S.HasExcessPressure = false;

return true;
}

bool UnclusteredRescheduleStage::initGCNRegion() {
if (!DAG.RescheduleRegions[RegionIdx])
return false;

return GCNSchedStage::initGCNRegion();
}

bool ClusteredLowOccStage::initGCNRegion() {
  // We may need to reschedule this region if it has no clusters and therefore
  // was not rescheduled in the previous stage, or if we found it was testing
  // critical register pressure limits in the unclustered reschedule stage. The
  // latter is because we may not have been able to raise the min occupancy in
  // the previous stage, so the region may be overly constrained even if it was
  // already rescheduled.
if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx])
return false;

return GCNSchedStage::initGCNRegion();
}

bool PreRARematStage::initGCNRegion() {
if (!DAG.RescheduleRegions[RegionIdx])
return false;

return GCNSchedStage::initGCNRegion();
}

void GCNSchedStage::setupNewBlock() {
if (CurrentMBB)
DAG.finishBlock();

CurrentMBB = DAG.RegionBegin->getParent();
DAG.startBlock(CurrentMBB);
  // Get the real RP for the region if it hasn't been calculated before. After
  // the initial schedule stage, real RP will be collected after scheduling.
if (StageID == GCNSchedStageID::InitialSchedule)
DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}

void GCNSchedStage::finalizeGCNRegion() {
DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
DAG.RescheduleRegions[RegionIdx] = false;
if (S.HasExcessPressure)
DAG.RegionsWithHighRP[RegionIdx] = true;

// Revert scheduling if we have dropped occupancy or there is some other
// reason that the original schedule is better.
checkScheduling();

DAG.exitRegion();
RegionIdx++;
}

void InitialScheduleStage::finalizeGCNRegion() {
// Record which regions have clustered nodes for the next unclustered
// reschedule stage.
assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
if (S.HasClusteredNodes)
DAG.RegionsWithClusters[RegionIdx] = true;

GCNSchedStage::finalizeGCNRegion();
}

void GCNSchedStage::checkScheduling() {
// Check the results of scheduling.
PressureAfter = DAG.getRealRegPressure(RegionIdx);
LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
PressureAfter.print(dbgs()));

if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;

  // Early out if we have achieved the occupancy target.
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}

unsigned WavesAfter =
std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST));
unsigned WavesBefore =
std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n");

// We may not be able to keep the current target occupancy because of the just
// scheduled region. We might still be able to revert scheduling if the
// occupancy before was higher, or if the current schedule has register
// pressure higher than the excess limits which could lead to more spilling.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

// Allow memory bound functions to drop to 4 waves if not limited by an
// attribute.
if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
WavesAfter >= MFI.getMinAllowedOccupancy()) {
LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
<< MFI.getMinAllowedOccupancy() << " waves\n");
NewOccupancy = WavesAfter;
}

if (NewOccupancy < DAG.MinOccupancy) {
DAG.MinOccupancy = NewOccupancy;
MFI.limitOccupancy(DAG.MinOccupancy);
DAG.RegionsWithMinOcc.reset();
LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
<< DAG.MinOccupancy << ".\n");
}

unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
PressureAfter.getAGPRNum() > MaxVGPRs ||
PressureAfter.getSGPRNum() > MaxSGPRs) {
DAG.RescheduleRegions[RegionIdx] = true;
DAG.RegionsWithHighRP[RegionIdx] = true;
}

// Revert if this region's schedule would cause a drop in occupancy or
// spilling.
if (shouldRevertScheduling(WavesAfter)) {
revertScheduling();
} else {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
}
}
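
To illustrate the occupancy decision in checkScheduling with concrete numbers (hypothetical values, not from a real kernel): with WavesBefore = 8, WavesAfter = 4, MinOccupancy = 8, and a minimum allowed occupancy of 4, the memory-bound exception accepts the drop and NewOccupancy becomes 4; raise the allowed floor to 5 and NewOccupancy stays 8, making the region a revert candidate. A standalone sketch of just this arithmetic:

// Sketch of the occupancy-drop rule only; not LLVM API. MinAllowed models
// MFI.getMinAllowedOccupancy() and MinOcc models DAG.MinOccupancy.
#include <algorithm>

unsigned decideNewOccupancy(unsigned WavesBefore, unsigned WavesAfter,
                            unsigned MinOcc, unsigned MinAllowed) {
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
  // Memory-bound exception: tolerate the drop if still at or above the
  // function's attribute-imposed floor.
  if (WavesAfter < WavesBefore && WavesAfter < MinOcc &&
      WavesAfter >= MinAllowed)
    NewOccupancy = WavesAfter;
  return NewOccupancy;
}
// decideNewOccupancy(8, 4, 8, 4) == 4; decideNewOccupancy(8, 4, 8, 5) == 8.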

bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
if (WavesAfter < DAG.MinOccupancy)
return true;

return false;
}

bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;

if (mayCauseSpilling(WavesAfter))
return true;

assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
// Don't reschedule the region in the next stage if it doesn't have clusters.
if (!DAG.RegionsWithClusters[RegionIdx])
DAG.RescheduleRegions[RegionIdx] = false;

return false;
}

bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;

  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
if (!PressureAfter.less(ST, PressureBefore)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
return true;
}

return false;
}

bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;

if (mayCauseSpilling(WavesAfter))
return true;

return false;
}

bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;

if (mayCauseSpilling(WavesAfter))
return true;

return false;
}

bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
if (WavesAfter <= MFI.getMinWavesPerEU() &&
!PressureAfter.less(ST, PressureBefore) &&
DAG.RescheduleRegions[RegionIdx]) {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
return true;
}

return false;
}

void GCNSchedStage::revertScheduling() {
DAG.RegionsWithMinOcc[RegionIdx] =
PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RescheduleRegions[RegionIdx] =
DAG.RegionsWithClusters[RegionIdx] ||
(nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule;
DAG.RegionEnd = DAG.RegionBegin;
int SkippedDebugInstr = 0;
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr()) {
++SkippedDebugInstr;
continue;
}

if (MI->getIterator() != DAG.RegionEnd) {
DAG.BB->remove(MI);
DAG.BB->insert(DAG.RegionEnd, MI);
if (!MI->isDebugInstr())
DAG.LIS->handleMove(*MI, true);
}

// Reset read-undef flags and update them later.
for (auto &Op : MI->operands())
if (Op.isReg() && Op.isDef())
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
if (!MI->isDebugInstr()) {
if (DAG.ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
} else {
// Adjust for missing dead-def flags.
RegOpers.detectDeadDefs(*MI, *DAG.LIS);
}
}
finishBlock();
DAG.RegionEnd = MI->getIterator();
++DAG.RegionEnd;
LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
}

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past debug instrs to the actual end of the scheduling region.
while (SkippedDebugInstr-- > 0)
++DAG.RegionEnd;

// If Unsched.front() instruction is a debug instruction, this will actually
// shrink the region since we moved all debug instructions to the end of the
// block. Find the first instruction that is not a debug instruction.
DAG.RegionBegin = Unsched.front()->getIterator();
if (DAG.RegionBegin->isDebugInstr()) {
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())
continue;
DAG.RegionBegin = MI->getIterator();
break;
}
}

// Then move the debug instructions back into their correct place and set
// RegionBegin and RegionEnd if needed.
DAG.placeDebugValues();

if (Stage == UnclusteredReschedule)
SavedMutations.swap(Mutations);
} while (Stage != LastStage);
DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
}

void GCNScheduleDAGMILive::collectRematerializableInstructions() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
void PreRARematStage::collectRematerializableInstructions() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!LIS->hasInterval(Reg))
if (!DAG.LIS->hasInterval(Reg))
continue;

// TODO: Handle AGPR and SGPR rematerialization
if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
!MRI.hasOneNonDBGUse(Reg))
if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
!DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
continue;

MachineOperand *Op = MRI.getOneDef(Reg);
MachineOperand *Op = DAG.MRI.getOneDef(Reg);
MachineInstr *Def = Op->getParent();
if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
continue;

MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg);
MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
if (Def->getParent() == UseI->getParent())
continue;

// We are only collecting defs that are defined in another block and are
// live-through or used inside regions at MinOccupancy. This means that the
// register must be in the live-in set for the region.
bool AddedToRematList = false;
for (unsigned I = 0, E = Regions.size(); I != E; ++I) {
auto It = LiveIns[I].find(Reg);
if (It != LiveIns[I].end() && !It->second.none()) {
if (RegionsWithMinOcc[I]) {
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto It = DAG.LiveIns[I].find(Reg);
if (It != DAG.LiveIns[I].end() && !It->second.none()) {
if (DAG.RegionsWithMinOcc[I]) {
RematerializableInsts[I][Def] = UseI;
AddedToRematList = true;
}
@@ -762,8 +923,8 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() {
}
}

bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII) {
bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII) {
// Temporary copies of cached variables we will be modifying and replacing if
// sinking succeeds.
SmallVector<
@@ -772,9 +933,10 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
DenseMap<unsigned, GCNRegPressure> NewPressure;
BitVector NewRescheduleRegions;
LiveIntervals *LIS = DAG.LIS;

NewRegions.resize(Regions.size());
NewRescheduleRegions.resize(Regions.size());
NewRegions.resize(DAG.Regions.size());
NewRescheduleRegions.resize(DAG.Regions.size());

  // Collect only regions that have a rematerializable def as a live-in.
SmallSet<unsigned, 16> ImpactedRegions;
@@ -784,16 +946,16 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
// Make copies of register pressure and live-ins cache that will be updated
// as we rematerialize.
for (auto Idx : ImpactedRegions) {
NewPressure[Idx] = Pressure[Idx];
NewLiveIns[Idx] = LiveIns[Idx];
NewPressure[Idx] = DAG.Pressure[Idx];
NewLiveIns[Idx] = DAG.LiveIns[Idx];
}
NewRegions = Regions;
NewRegions = DAG.Regions;
NewRescheduleRegions.reset();

DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
bool Improved = false;
for (auto I : ImpactedRegions) {
if (!RegionsWithMinOcc[I])
if (!DAG.RegionsWithMinOcc[I])
continue;

Improved = false;
@@ -802,12 +964,12 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,

// TODO: Handle occupancy drop due to AGPR and SGPR.
// Check if cause of occupancy drop is due to VGPR usage and not SGPR.
if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
break;

// The occupancy of this region could have been improved by a previous
// iteration's sinking of defs.
if (NewPressure[I].getOccupancy(ST) > MinOccupancy) {
if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
NewRescheduleRegions[I] = true;
Improved = true;
continue;
@@ -827,7 +989,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
// If in the most optimistic scenario, we cannot improve occupancy, then do
// not attempt to sink any instructions.
if (OptimisticOccupancy <= MinOccupancy)
if (OptimisticOccupancy <= DAG.MinOccupancy)
break;

unsigned ImproveOccupancy = 0;
@@ -842,7 +1004,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
// call LiveRangeEdit::allUsesAvailableAt() and
// LiveRangeEdit::canRematerializeAt().
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
Def->getOperand(0).getSubReg(), *Def, *TRI);
Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
MachineInstr *NewMI = &*(--InsertPos);
LIS->InsertMachineInstrInMaps(*NewMI);
LIS->removeInterval(Reg);
@@ -851,21 +1013,21 @@

      // Update region boundaries in the scheduling region we sunk from, since
      // we may sink an instruction that was at the beginning or end of its
      // region.
updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
/*Removing =*/true);
DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
/*Removing =*/true);

      // Update region boundaries in the region we sunk to.
updateRegionBoundaries(NewRegions, InsertPos, NewMI);
DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);

LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
for (auto Idx : RematDefToLiveInRegions[Def]) {
NewLiveIns[Idx].erase(Reg);
if (InsertPos->getParent() != Regions[Idx].first->getParent()) {
if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
// Def is live-through and not used in this block.
NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI);
NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
} else {
// Def is used and rematerialized into this block.
GCNDownwardRPTracker RPT(*LIS);
@@ -879,7 +1041,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,

SinkedDefs.push_back(Def);
ImproveOccupancy = NewPressure[I].getOccupancy(ST);
if (ImproveOccupancy > MinOccupancy)
if (ImproveOccupancy > DAG.MinOccupancy)
break;
}

@@ -888,7 +1050,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
for (auto TrackedIdx : RematDefToLiveInRegions[Def])
RematerializableInsts[TrackedIdx].erase(Def);

if (ImproveOccupancy <= MinOccupancy)
if (ImproveOccupancy <= DAG.MinOccupancy)
break;

NewRescheduleRegions[I] = true;
@@ -917,7 +1079,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
MachineInstr *OldMI = Entry.second;

// Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
BBLiveInMap.erase(OldMI);
DAG.BBLiveInMap.erase(OldMI);

// Remove OldMI and update LIS
Register Reg = MI->getOperand(0).getReg();
Expand All @@ -929,22 +1091,22 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,

// Update live-ins, register pressure, and regions caches.
for (auto Idx : ImpactedRegions) {
LiveIns[Idx] = NewLiveIns[Idx];
Pressure[Idx] = NewPressure[Idx];
MBBLiveIns.erase(Regions[Idx].first->getParent());
DAG.LiveIns[Idx] = NewLiveIns[Idx];
DAG.Pressure[Idx] = NewPressure[Idx];
DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
}
Regions = NewRegions;
RescheduleRegions = NewRescheduleRegions;
DAG.Regions = NewRegions;
DAG.RescheduleRegions = NewRescheduleRegions;

SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, ++MinOccupancy);
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

return true;
}

// Copied from MachineLICM
bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI) {
if (!TII->isTriviallyReMaterializable(MI))
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
if (!DAG.TII->isTriviallyReMaterializable(MI))
return false;

for (const MachineOperand &MO : MI.operands())
233 changes: 189 additions & 44 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -28,8 +28,6 @@ class GCNSubtarget;
/// heuristics to determine excess/critical pressure sets. Its goal is to
/// maximize kernel occupancy (i.e. maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
friend class GCNScheduleDAGMILive;

SUnit *pickNodeBidirectional(bool &IsTopNode);

void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
@@ -42,15 +40,18 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
unsigned SGPRPressure, unsigned VGPRPressure);

std::vector<unsigned> Pressure;

std::vector<unsigned> MaxPressure;

unsigned SGPRExcessLimit;

unsigned VGPRExcessLimit;
unsigned SGPRCriticalLimit;
unsigned VGPRCriticalLimit;

unsigned TargetOccupancy;

MachineFunction *MF;

public:
  // schedule() has seen a clustered memory operation. Set it to false
  // before scheduling a region to know whether the region had such clusters.
bool HasClusteredNodes;
@@ -59,28 +60,53 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
// register pressure for actual scheduling heuristics.
bool HasExcessPressure;

MachineFunction *MF;
unsigned SGPRCriticalLimit;

unsigned VGPRCriticalLimit;

public:
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);

SUnit *pickNode(bool &IsTopNode) override;

void initialize(ScheduleDAGMI *DAG) override;

unsigned getTargetOccupancy() { return TargetOccupancy; }

void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
};

class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
enum class GCNSchedStageID : unsigned {
InitialSchedule = 0,
UnclusteredReschedule = 1,
ClusteredLowOccupancyReschedule = 2,
PreRARematerialize = 3,
LastStage = PreRARematerialize
};

#ifndef NDEBUG
raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
#endif

inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) {
assert(Stage != GCNSchedStageID::PreRARematerialize);
Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
return Stage;
}

inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) {
return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
}

enum : unsigned {
Collect,
InitialSchedule,
UnclusteredReschedule,
ClusteredLowOccupancyReschedule,
PreRARematerialize,
LastStage = PreRARematerialize
};
inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) {
return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS);
}
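
A brief usage sketch for the stage-ID helpers above (the loop is hypothetical; the real driver is GCNScheduleDAGMILive::runSchedStages). nextStage() computes the following ID without mutating its argument, while the postfix ++ advances in place and asserts it is never applied to the final stage:

inline void forEachStageIDSketch() {
  GCNSchedStageID Stage = GCNSchedStageID::InitialSchedule;
  while (true) {
    // ... dispatch on 'Stage' here ...
    if (Stage == GCNSchedStageID::LastStage)
      break;
    Stage++; // safe: we break before incrementing the last stage
  }
}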

class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class InitialScheduleStage;
friend class UnclusteredRescheduleStage;
friend class ClusteredLowOccStage;
friend class PreRARematStage;

const GCNSubtarget &ST;

@@ -92,12 +118,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
  // Minimal real occupancy recorded for the function.
unsigned MinOccupancy;

// Scheduling stage number.
unsigned Stage;

// Current region index.
size_t RegionIdx;

  // Vector of regions recorded for later rescheduling.
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
@@ -121,6 +141,148 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Region pressure cache.
SmallVector<GCNRegPressure, 32> Pressure;

// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;

// Return current region pressure.
GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;

// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);

// Update region boundaries when removing MI or inserting NewMI before MI.
void updateRegionBoundaries(
SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>> &RegionBoundaries,
MachineBasicBlock::iterator MI, MachineInstr *NewMI,
bool Removing = false);

void runSchedStages();

public:
GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S);

void schedule() override;

void finalizeSchedule() override;
};

// GCNSchedStrategy applies multiple scheduling stages to a function.
class GCNSchedStage {
protected:
GCNScheduleDAGMILive &DAG;

GCNMaxOccupancySchedStrategy &S;

MachineFunction &MF;

SIMachineFunctionInfo &MFI;

const GCNSubtarget &ST;

const GCNSchedStageID StageID;

// The current block being scheduled.
MachineBasicBlock *CurrentMBB = nullptr;

// Current region index.
unsigned RegionIdx = 0;

// Record the original order of instructions before scheduling.
std::vector<MachineInstr *> Unsched;

// RP before scheduling the current region.
GCNRegPressure PressureBefore;

// RP after scheduling the current region.
GCNRegPressure PressureAfter;

GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG);

public:
// Initialize state for a scheduling stage. Returns false if the current stage
// should be skipped.
virtual bool initGCNSchedStage();

// Finalize state after finishing a scheduling pass on the function.
virtual void finalizeGCNSchedStage();

// Setup for scheduling a region. Returns false if the current region should
// be skipped.
virtual bool initGCNRegion();

// Track whether a new region is also a new MBB.
void setupNewBlock();

  // Finalize state after scheduling a region.
virtual void finalizeGCNRegion();

// Check result of scheduling.
void checkScheduling();

// Returns true if scheduling should be reverted.
virtual bool shouldRevertScheduling(unsigned WavesAfter);

// Returns true if the new schedule may result in more spilling.
bool mayCauseSpilling(unsigned WavesAfter);

// Attempt to revert scheduling for this region.
void revertScheduling();

void advanceRegion() { RegionIdx++; }

virtual ~GCNSchedStage() = default;
};
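
For illustration, a hypothetical additional stage would follow the same pattern as the concrete stages below, overriding only the hooks whose default behavior it needs to change (this class is not part of the patch):

class HypotheticalStage : public GCNSchedStage {
public:
  bool initGCNSchedStage() override {
    // Run the common setup first; returning false skips the entire stage.
    if (!GCNSchedStage::initGCNSchedStage())
      return false;
    return true;
  }

  bool shouldRevertScheduling(unsigned WavesAfter) override {
    // Reuse the shared occupancy check before any stage-specific criteria.
    return GCNSchedStage::shouldRevertScheduling(WavesAfter);
  }

  HypotheticalStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
      : GCNSchedStage(StageID, DAG) {}
};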

class InitialScheduleStage : public GCNSchedStage {
public:
void finalizeGCNRegion() override;

bool shouldRevertScheduling(unsigned WavesAfter) override;

InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};

class UnclusteredRescheduleStage : public GCNSchedStage {
private:
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;

public:
bool initGCNSchedStage() override;

void finalizeGCNSchedStage() override;

bool initGCNRegion() override;

bool shouldRevertScheduling(unsigned WavesAfter) override;

UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};

// Retry function scheduling if the resulting occupancy is lower than the one
// used for other scheduling passes. This will give more freedom to schedule
// low register pressure blocks.
class ClusteredLowOccStage : public GCNSchedStage {
public:
bool initGCNSchedStage() override;

bool initGCNRegion() override;

bool shouldRevertScheduling(unsigned WavesAfter) override;

ClusteredLowOccStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};

class PreRARematStage : public GCNSchedStage {
private:
  // Each region at MinOccupancy will have its own list of trivially
// rematerializable instructions we can remat to reduce RP. The list maps an
// instruction to the position we should remat before, usually the MI using
@@ -132,12 +294,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// that has the defined reg as a live-in.
DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;

// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;

// Collect all trivially rematerializable VGPR instructions with a single def
// and single use outside the defining block into RematerializableInsts.
void collectRematerializableInstructions();
@@ -150,26 +306,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII);

// Return current region pressure.
GCNRegPressure getRealRegPressure() const;

// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(const MachineBasicBlock *MBB);

// Update region boundaries when removing MI or inserting NewMI before MI.
void updateRegionBoundaries(
SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>> &RegionBoundaries,
MachineBasicBlock::iterator MI, MachineInstr *NewMI,
bool Removing = false);

public:
GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S);
bool initGCNSchedStage() override;

void schedule() override;
bool initGCNRegion() override;

void finalizeSchedule() override;
bool shouldRevertScheduling(unsigned WavesAfter) override;

PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};

} // End namespace llvm