diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 1faf910c91f1e..643eb0b63af6d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -426,6 +426,15 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   return DAG;
 }
 
+static ScheduleDAGInstrs *
+createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
+  ScheduleDAGMILive *DAG =
+      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
+  DAG->addMutation(createIGroupLPDAGMutation());
+  DAG->addMutation(createSchedBarrierDAGMutation());
+  return DAG;
+}
+
 static ScheduleDAGInstrs *
 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
@@ -464,19 +473,23 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                              createGCNMaxOccupancyMachineScheduler);
 
 static MachineSchedRegistry
-IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
-  "Run GCN scheduler to maximize occupancy (experimental)",
-  createIterativeGCNMaxOccupancyMachineScheduler);
-
-static MachineSchedRegistry
-GCNMinRegSchedRegistry("gcn-minreg",
-  "Run GCN iterative scheduler for minimal register usage (experimental)",
-  createMinRegScheduler);
-
-static MachineSchedRegistry
-GCNILPSchedRegistry("gcn-ilp",
-  "Run GCN iterative scheduler for ILP scheduling (experimental)",
-  createIterativeILPMachineScheduler);
+    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
+                           createGCNMaxILPMachineScheduler);
+
+static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
+    "gcn-iterative-max-occupancy-experimental",
+    "Run GCN scheduler to maximize occupancy (experimental)",
+    createIterativeGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry GCNMinRegSchedRegistry(
+    "gcn-iterative-minreg",
+    "Run GCN iterative scheduler for minimal register usage (experimental)",
+    createMinRegScheduler);
+
+static MachineSchedRegistry GCNILPSchedRegistry(
+    "gcn-iterative-ilp",
+    "Run GCN iterative scheduler for ILP scheduling (experimental)",
+    createIterativeILPMachineScheduler);
 
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 4b90513b4dede..1d83f2e3011d1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -38,12 +38,11 @@ cl::opt
              "reduction scheduling stage."),
     cl::init(false));
 
-GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
-    const MachineSchedContext *C)
+GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
       HasHighPressure(false) {}
 
-void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
+void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
 
   MF = &DAG->MF;
@@ -74,8 +73,9 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
   VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
 }
 
-void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
-                                     bool AtTop, const RegPressureTracker &RPTracker,
+void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+                                     bool AtTop,
+                                     const RegPressureTracker &RPTracker,
                                      const SIRegisterInfo *SRI,
                                      unsigned SGPRPressure,
                                      unsigned VGPRPressure) {
@@ -161,7 +161,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
 
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNodeFromQueue()
-void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                          const CandPolicy &ZonePolicy,
                                          const RegPressureTracker &RPTracker,
                                          SchedCandidate &Cand) {
@@ -181,7 +181,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                   SGPRPressure, VGPRPressure);
     // Pass SchedBoundary only when comparing nodes from the same boundary.
     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
-    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+    tryCandidate(Cand, TryCand, ZoneArg);
     if (TryCand.Reason != NoCand) {
       // Initialize resource delta if needed in case future heuristics query it.
       if (TryCand.ResDelta == SchedResourceDelta())
@@ -194,7 +194,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
 
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNodeBidirectional()
-SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
   // Schedule as far as possible in the direction of no choice. This is most
   // efficient, but also provides the best heuristics for CriticalPSets.
   if (SUnit *SU = Bot.pickOnlyChoice()) {
@@ -259,7 +259,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
   SchedCandidate Cand = BotCand;
   TopCand.Reason = NoCand;
-  GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+  tryCandidate(Cand, TopCand, nullptr);
   if (TopCand.Reason != NoCand) {
     Cand.setBest(TopCand);
   }
@@ -271,7 +271,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
 
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNode()
-SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
   if (DAG->top() == DAG->bottom()) {
     assert(Top.Available.empty() && Top.Pending.empty() &&
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
@@ -314,6 +314,129 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
   return SU;
 }
 
+GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
+  assert(CurrentStage && CurrentStage != SchedStages.end());
+  return *CurrentStage;
+}
+
+bool GCNSchedStrategy::advanceStage() {
+  assert(CurrentStage != SchedStages.end());
+  if (!CurrentStage)
+    CurrentStage = SchedStages.begin();
+  else
+    CurrentStage++;
+
+  return CurrentStage != SchedStages.end();
+}
+
+bool GCNSchedStrategy::hasNextStage() const {
+  assert(CurrentStage);
+  return std::next(CurrentStage) != SchedStages.end();
+}
+
+GCNSchedStageID GCNSchedStrategy::getNextStage() const {
+  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
+  return *std::next(CurrentStage);
+}
+
+GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
+    const MachineSchedContext *C)
+    : GCNSchedStrategy(C) {
+  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
+  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
+  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+}
+
+GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
+    : GCNSchedStrategy(C) {
+  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
+}
+
+bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
+                                          SchedCandidate &TryCand,
+                                          SchedBoundary *Zone) const {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Avoid spilling by exceeding the register limit.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+                  RegExcess, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  // Bias PhysReg Defs and copies to their uses and defined respectively.
+  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+    return TryCand.Reason != NoCand;
+
+  bool SameBoundary = Zone != nullptr;
+  if (SameBoundary) {
+    // Prioritize instructions that read unbuffered resources by stall cycles.
+    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+      return TryCand.Reason != NoCand;
+
+    // Avoid critical resource consumption and balance the schedule.
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return TryCand.Reason != NoCand;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return TryCand.Reason != NoCand;
+
+    // Unconditionally try to reduce latency.
+    if (tryLatency(TryCand, Cand, *Zone))
+      return TryCand.Reason != NoCand;
+
+    // Weak edges are for clustering and other constraints.
+    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+      return TryCand.Reason != NoCand;
+  }
+
+  // Keep clustered nodes together to encourage downstream peephole
+  // optimizations which may reduce resource requirements.
+  //
+  // This is a best effort to set things up for a post-RA pass. Optimizations
+  // like generating loads of multiple registers should ideally be done within
+  // the scheduler pass by combining the loads during DAG postprocessing.
+  const SUnit *CandNextClusterSU =
+      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  const SUnit *TryCandNextClusterSU =
+      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
+    return TryCand.Reason != NoCand;
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+                  TryCand, Cand, RegCritical, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  // Avoid increasing the max pressure of the entire region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+                  Cand, RegMax, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  if (SameBoundary) {
+    // Fall through to original instruction order.
+    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+      TryCand.Reason = NodeOrder;
+      return true;
+    }
+  }
+  return false;
+}
+
 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
@@ -323,6 +446,22 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
 }
 
+std::unique_ptr<GCNSchedStage>
+GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
+  switch (SchedStageID) {
+  case GCNSchedStageID::OccInitialSchedule:
+    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::UnclusteredHighRPReschedule:
+    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
+  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
+    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
+  case GCNSchedStageID::PreRARematerialize:
+    return std::make_unique<PreRARematStage>(SchedStageID, *this);
+  case GCNSchedStageID::ILPInitialSchedule:
+    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
+  }
+}
+
 void GCNScheduleDAGMILive::schedule() {
   // Collect all scheduling regions. The actual scheduling is performed in
   // GCNScheduleDAGMILive::finalizeSchedule.
@@ -439,18 +578,13 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
 void GCNScheduleDAGMILive::runSchedStages() {
   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
 
-  InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
-  UnclusteredHighRPStage S1(GCNSchedStageID::UnclusteredHighRPReschedule,
-                            *this);
-  ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
-                          *this);
-  PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
-  GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3};
-
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
 
-  for (auto *Stage : SchedStages) {
+  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
+  while (S.advanceStage()) {
+    auto Stage = createSchedStage(S.getCurrentStage());
     if (!Stage->initGCNSchedStage())
       continue;
 
@@ -475,8 +609,8 @@
 #ifndef NDEBUG
 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   switch (StageID) {
-  case GCNSchedStageID::InitialSchedule:
-    OS << "Initial Schedule";
+  case GCNSchedStageID::OccInitialSchedule:
+    OS << "Max Occupancy Initial Schedule";
     break;
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     OS << "Unclustered High Register Pressure Reschedule";
@@ -487,14 +621,18 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::PreRARematerialize:
     OS << "Pre-RA Rematerialize";
     break;
+  case GCNSchedStageID::ILPInitialSchedule:
+    OS << "Max ILP Initial Schedule";
+    break;
   }
+  return OS;
 }
 #endif
 
 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID,
                              GCNScheduleDAGMILive &DAG)
-    : DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)),
-      MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
+    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
+      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
 
 bool GCNSchedStage::initGCNSchedStage() {
   if (!DAG.LIS)
@@ -564,6 +702,7 @@ bool PreRARematStage::initGCNSchedStage() {
   // inbetween the defs and region we sinked the def to. Cached pressure
   // for regions where a def is sinked from will also be invalidated. Will
   // need to be fixed if there is another pass after this pass.
+  assert(!S.hasNextStage());
 
   collectRematerializableInstructions();
   if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
@@ -674,7 +813,7 @@ void GCNSchedStage::setupNewBlock() {
   DAG.startBlock(CurrentMBB);
 
   // Get real RP for the region if it hasn't be calculated before. After the
   // initial schedule stage real RP will be collected after scheduling.
-  if (StageID == GCNSchedStageID::InitialSchedule)
+  if (StageID == GCNSchedStageID::OccInitialSchedule)
     DAG.computeBlockPressure(RegionIdx, CurrentMBB);
 }
 
@@ -767,7 +906,7 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   return false;
 }
 
-bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
     return true;
 
@@ -810,6 +949,13 @@ bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
   return false;
 }
 
+bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (mayCauseSpilling(WavesAfter))
+    return true;
+
+  return false;
+}
+
 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
   if (WavesAfter <= MFI.getMinWavesPerEU() &&
       !PressureAfter.less(ST, PressureBefore) &&
@@ -826,7 +972,8 @@ void GCNSchedStage::revertScheduling() {
     PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
   DAG.RescheduleRegions[RegionIdx] =
-      (nextStage(StageID)) != GCNSchedStageID::UnclusteredHighRPReschedule;
+      S.hasNextStage() &&
+      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
   DAG.RegionEnd = DAG.RegionBegin;
   int SkippedDebugInstr = 0;
   for (MachineInstr *MI : Unsched) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index ffa68bae67a1c..94d14312d4197 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -22,12 +22,25 @@ namespace llvm {
 class SIMachineFunctionInfo;
 class SIRegisterInfo;
 class GCNSubtarget;
+class GCNSchedStage;
+
+enum class GCNSchedStageID : unsigned {
+  OccInitialSchedule = 0,
+  UnclusteredHighRPReschedule = 1,
+  ClusteredLowOccupancyReschedule = 2,
+  PreRARematerialize = 3,
+  ILPInitialSchedule = 4
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
+#endif
 
 /// This is a minimal scheduler strategy. The main difference between this
 /// and the GenericScheduler is that GCNSchedStrategy uses different
-/// heuristics to determine excess/critical pressure sets. Its goal is to
-/// maximize kernel occupancy (i.e. maximum number of waves per simd).
-class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
+/// heuristics to determine excess/critical pressure sets.
+class GCNSchedStrategy : public GenericScheduler {
+protected:
   SUnit *pickNodeBidirectional(bool &IsTopNode);
 
   void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
@@ -51,6 +64,12 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
 
   MachineFunction *MF;
 
+  // Scheduling stages for this strategy.
+  SmallVector<GCNSchedStageID, 4> SchedStages;
+
+  // Pointer to the current SchedStageID.
+  SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
+
 public:
   // schedule() have seen register pressure over the critical limits and had to
   // track register pressure for actual scheduling heuristics.
@@ -69,7 +88,7 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
 
   unsigned VGPRCriticalLimit;
 
-  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+  GCNSchedStrategy(const MachineSchedContext *C);
 
   SUnit *pickNode(bool &IsTopNode) override;
 
@@ -78,40 +97,42 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
   unsigned getTargetOccupancy() { return TargetOccupancy; }
 
   void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
-};
 
-enum class GCNSchedStageID : unsigned {
-  InitialSchedule = 0,
-  UnclusteredHighRPReschedule = 1,
-  ClusteredLowOccupancyReschedule = 2,
-  PreRARematerialize = 3,
-  LastStage = PreRARematerialize
-};
+  GCNSchedStageID getCurrentStage();
 
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
-#endif
+  // Advances stage. Returns true if there are remaining stages.
+  bool advanceStage();
 
-inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) {
-  assert(Stage != GCNSchedStageID::PreRARematerialize);
-  Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
-  return Stage;
-}
+  bool hasNextStage() const;
+
+  GCNSchedStageID getNextStage() const;
+};
 
-inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) {
-  return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
-}
+/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
+/// maximum number of waves per simd).
+class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
+public:
+  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+};
 
-inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) {
-  return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS);
-}
+/// The goal of this scheduling strategy is to maximize ILP for a single wave
+/// (i.e. latency hiding).
+class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
+protected:
+  bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                    SchedBoundary *Zone) const override;
+
+public:
+  GCNMaxILPSchedStrategy(const MachineSchedContext *C);
+};
 
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
-  friend class InitialScheduleStage;
+  friend class OccInitialScheduleStage;
   friend class UnclusteredHighRPStage;
   friend class ClusteredLowOccStage;
   friend class PreRARematStage;
+  friend class ILPInitialScheduleStage;
 
   const GCNSubtarget &ST;
 
@@ -169,6 +190,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
 
   void runSchedStages();
 
+  std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
+
 public:
   GCNScheduleDAGMILive(MachineSchedContext *C,
                        std::unique_ptr<MachineSchedStrategy> S);
 
@@ -183,7 +206,7 @@ class GCNSchedStage {
 protected:
   GCNScheduleDAGMILive &DAG;
 
-  GCNMaxOccupancySchedStrategy &S;
+  GCNSchedStrategy &S;
 
   MachineFunction &MF;
 
@@ -245,11 +268,11 @@ class GCNSchedStage {
   virtual ~GCNSchedStage() = default;
 };
 
-class InitialScheduleStage : public GCNSchedStage {
+class OccInitialScheduleStage : public GCNSchedStage {
 public:
   bool shouldRevertScheduling(unsigned WavesAfter) override;
 
-  InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+  OccInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
 };
 
@@ -324,6 +347,14 @@ class PreRARematStage : public GCNSchedStage {
       : GCNSchedStage(StageID, DAG) {}
 };
 
+class ILPInitialScheduleStage : public GCNSchedStage {
+public:
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
 } // End namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
index 437e3a78ad593..ea5062c4925c4 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
 
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
index eaa30bbb01012..a9e395108698e 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
 
 ; We expect a two digit VGPR usage here, not a three digit.
 ; CHECK: NumVgprs: {{[0-9][0-9]$}}
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index e209f9e6196d5..87b2d29c8b0fb 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
-; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
 
 ; SI-MINREG: NumSgprs: {{[1-9]$}}
 ; SI-MINREG: NumVgprs: {{[1-9]$}}
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
index e5f08dbedce3d..d567e1535f64c 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=MISCHED %s
-; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
 
 ; Test the scheduler when only one wave is requested. The result should be high register usage and max ILP.
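---
Usage note, not part of the patch itself: every strategy touched here is selected through llc's -misched scheduler registry, exactly as the updated RUN lines exercise. A minimal invocation sketch, with kernel.ll as a placeholder input file and tonga used only because the tests above use it:

  llc -march=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < kernel.ll
  llc -march=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < kernel.ll

gcn-max-ilp runs GCNMaxILPSchedStrategy with its single ILPInitialSchedule stage, while the gcn-iterative-* names are the pre-existing experimental iterative schedulers re-registered under a clearer prefix (gcn-iterative-minreg, gcn-iterative-ilp, gcn-iterative-max-occupancy-experimental).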