From 305fb4fbfadccd26c39249a725e9e06bea8a9102 Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Tue, 25 Nov 2025 22:18:19 -0800 Subject: [PATCH] [AMDGPU] Add structural stall heuristic to scheduling strategies Implements a structural stall heuristic that considers both resource hazards and latency constraints when selecting instructions from the pending queue. - Add getStructuralStallCycles() to GCNSchedStrategy that computes the number of cycles an instruction must wait due to: - Resource conflicts on unbuffered resources (from the SchedModel) - Sequence-dependent hazards (from GCNHazardRecognizer) - Add getHazardWaitStates() to GCNHazardRecognizer that returns the number of wait states until all hazards for an instruction are resolved, providing cycle-accurate hazard information for scheduling heuristics. --- .../Target/AMDGPU/AMDGPUMLSchedStrategy.cpp | 72 +++++++++++++++++++ .../lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h | 3 + .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 ++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 6 ++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 35 +++++++++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 8 ++- .../AMDGPU/ml-sched-effective-stall.mir | 8 ++- 7 files changed, 131 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp index 8c68223c0a492..08cf930de3178 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp @@ -13,6 +13,10 @@ #include "AMDGPUMLSchedStrategy.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "machine-scheduler" + using namespace llvm; AMDGPUMLSchedStrategy::AMDGPUMLSchedStrategy(const MachineSchedContext *C) @@ -130,6 +134,74 @@ bool AMDGPUMLSchedStrategy::tryCandidate(SchedCandidate &Cand, return false; } +bool AMDGPUMLSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Bias PhysReg Defs and copies to their uses and defined respectively. + if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) + return TryCand.Reason != NoCand; + + // Avoid exceeding the target's limit. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, + RegExcess, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + // Avoid increasing the max critical pressure in the scheduled region. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, RegCritical, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + // Compare effective stall cycles between candidates. + // Effective stall = max(structural stall, latency stall) + // - Structural stalls: resource/hazard constraints (HW not ready) + // - Latency stalls: data dependency constraints (operands not ready) + // + // This allows picking a pending instruction with structural stalls over + // an available instruction with higher latency stalls (e.g., scheduling + // a WMMA while waiting for a memory load result). 
+    unsigned TryStructStall = getStructuralStallCycles(*Zone, TryCand.SU);
+    unsigned TryLatencyStall = Zone->getLatencyStallCycles(TryCand.SU);
+    unsigned TryEffectiveStall = std::max(TryStructStall, TryLatencyStall);
+
+    unsigned CandStructStall = getStructuralStallCycles(*Zone, Cand.SU);
+    unsigned CandLatencyStall = Zone->getLatencyStallCycles(Cand.SU);
+    unsigned CandEffectiveStall = std::max(CandStructStall, CandLatencyStall);
+
+    LLVM_DEBUG(if (TryEffectiveStall || CandEffectiveStall) {
+      dbgs() << "Effective stalls: try=" << TryEffectiveStall
+             << " (struct=" << TryStructStall << ", lat=" << TryLatencyStall
+             << ") cand=" << CandEffectiveStall
+             << " (struct=" << CandStructStall << ", lat=" << CandLatencyStall
+             << ")\n";
+    });
+
+    if (tryLess(TryEffectiveStall, CandEffectiveStall, TryCand, Cand, Stall))
+      return TryCand.Reason != NoCand;
+
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return TryCand.Reason != NoCand;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return TryCand.Reason != NoCand;
+  }
+
+  return false;
+}
+
 AMDGPUMLPostSchedStrategy::AMDGPUMLPostSchedStrategy(
     const MachineSchedContext *C)
     : PostGenericScheduler(C) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
index fd13b57a28f43..05465668e0014 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
@@ -24,6 +24,9 @@ class AMDGPUMLSchedStrategy final : public GCNSchedStrategy {
   bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
                     SchedBoundary *Zone) const override;
 
+  bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                           SchedBoundary *Zone) const override;
+
 public:
   AMDGPUMLSchedStrategy(const MachineSchedContext *C);
 
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 7fbf520c670ae..ff955c7354cba 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -313,6 +313,10 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
   return std::max(W, NopPadding.getValue());
 }
 
+unsigned GCNHazardRecognizer::getHazardWaitStates(MachineInstr *MI) const {
+  return const_cast<GCNHazardRecognizer *>(this)->PreEmitNoopsCommon(MI);
+}
+
 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
   if (MI->isBundle())
     return 0;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 67beffadc0913..be914d8657870 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -145,6 +145,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   void EmitInstruction(SUnit *SU) override;
   void EmitInstruction(MachineInstr *MI) override;
   HazardType getHazardType(SUnit *SU, int Stalls) override;
+
+  /// Returns the number of wait states until all hazards for \p MI are
+  /// resolved. This is useful for scheduling heuristics that want
+  /// cycle-accurate hazard information rather than just a boolean. Unlike
+  /// PreEmitNoops, this does not modify state or fix hazards.
+  unsigned getHazardWaitStates(MachineInstr *MI) const;
   void EmitNoop() override;
   unsigned PreEmitNoops(MachineInstr *) override;
   unsigned PreEmitNoopsCommon(MachineInstr *);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index b9362c41cdb7c..ee76368d08859 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,6 +25,7 @@
 #include "GCNSchedStrategy.h"
 #include "AMDGPUIGroupLP.h"
+#include "GCNHazardRecognizer.h"
 #include "GCNRegPressure.h"
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
@@ -218,6 +219,40 @@ void GCNSchedStrategy::getRegisterPressures(
   Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
 }
 
+unsigned GCNSchedStrategy::getStructuralStallCycles(SchedBoundary &Zone,
+                                                    SUnit *SU) const {
+  // Only implemented for top-down scheduling currently.
+  if (!Zone.isTop() || !SU)
+    return 0;
+
+  MachineInstr *MI = SU->getInstr();
+  unsigned CurrCycle = Zone.getCurrCycle();
+  unsigned Stall = 0;
+
+  // Query SchedModel for resource stalls (unbuffered resources).
+  if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
+    const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+    for (const MCWriteProcResEntry &PE :
+         make_range(SchedModel->getWriteProcResBegin(SC),
+                    SchedModel->getWriteProcResEnd(SC))) {
+      unsigned NextAvail =
+          Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
+                                    PE.AcquireAtCycle)
+              .first;
+      if (NextAvail > CurrCycle)
+        Stall = std::max(Stall, NextAvail - CurrCycle);
+    }
+  }
+
+  // Query HazardRecognizer for sequence-dependent hazard penalties.
+  if (Zone.HazardRec && Zone.HazardRec->isEnabled()) {
+    auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
+    Stall = std::max(Stall, HR->getHazardWaitStates(MI));
+  }
+
+  return Stall;
+}
+
 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                      bool AtTop,
                                      const RegPressureTracker &RPTracker,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 367f47c3ca4ae..048eeecac0ab9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -56,6 +56,10 @@ class GCNSchedStrategy : public GenericScheduler {
                             const SIRegisterInfo *SRI, unsigned SGPRPressure,
                             unsigned VGPRPressure, bool IsBottomUp);
 
+  /// Estimate how many cycles \p SU must wait due to structural hazards at the
+  /// current boundary cycle. Returns zero when no stall is required.
+  unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const;
+
   /// Evaluates instructions in the pending queue using a subset of scheduling
   /// heuristics.
   ///
   /// Pending queue instructions are those whose dependencies are met, but which
   /// are waiting on latency or hardware resource availability. They are
   /// invisible to scheduling heuristics. However, in certain scenarios (such as
   /// avoiding register spilling), it may be beneficial to consider scheduling
   /// these not-yet-ready instructions.
- bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary *Zone) const; + virtual bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const; void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred); diff --git a/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir index bb82c7364d0ff..0a04e7964e63c 100644 --- a/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir +++ b/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir @@ -12,6 +12,8 @@ attributes #1 = { "amdgpu-waves-per-eu"="1,1" } ... +# The scheduler should reorder the use of the global load after WMMAs to hide memory latency. + --- name: with_ml_workload_attr tracksRegLiveness: true @@ -31,8 +33,8 @@ body: | ; DEFAULT-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; DEFAULT-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec ; DEFAULT-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; DEFAULT-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; DEFAULT-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14 ; ; ML-LABEL: name: with_ml_workload_attr @@ -49,8 +51,8 @@ body: | ; ML-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec ; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; ML-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; ML-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14 %0:vreg_512_align2 = IMPLICIT_DEF %1:vreg_512_align2 = IMPLICIT_DEF @@ -108,8 +110,8 @@ body: | ; ML-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec ; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit 
$mode, implicit $exec ; ML-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; ML-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14 %0:vreg_512_align2 = IMPLICIT_DEF %1:vreg_512_align2 = IMPLICIT_DEF