From 305fb4fbfadccd26c39249a725e9e06bea8a9102 Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Tue, 25 Nov 2025 22:18:19 -0800 Subject: [PATCH] [AMDGPU] Add structural stall heuristic to scheduling strategies Implements a structural stall heuristic that considers both resource hazards and latency constraints when selecting instructions from the pending queue. - Add getStructuralStallCycles() to GCNSchedStrategy that computes the number of cycles an instruction must wait due to: - Resource conflicts on unbuffered resources (from the SchedModel) - Sequence-dependent hazards (from GCNHazardRecognizer) - Add getHazardWaitStates() to GCNHazardRecognizer that returns the number of wait states until all hazards for an instruction are resolved, providing cycle-accurate hazard information for scheduling heuristics. --- .../Target/AMDGPU/AMDGPUMLSchedStrategy.cpp | 72 +++++++++++++++++++ .../lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h | 3 + .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 ++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 6 ++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 35 +++++++++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 8 ++- .../AMDGPU/ml-sched-effective-stall.mir | 8 ++- 7 files changed, 131 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp index 8c68223c0a492..08cf930de3178 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp @@ -13,6 +13,10 @@ #include "AMDGPUMLSchedStrategy.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "machine-scheduler" + using namespace llvm; AMDGPUMLSchedStrategy::AMDGPUMLSchedStrategy(const MachineSchedContext *C) @@ -130,6 +134,74 @@ bool AMDGPUMLSchedStrategy::tryCandidate(SchedCandidate &Cand, return false; } +bool AMDGPUMLSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Bias PhysReg Defs and copies to their uses and defined respectively. + if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) + return TryCand.Reason != NoCand; + + // Avoid exceeding the target's limit. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, + RegExcess, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + // Avoid increasing the max critical pressure in the scheduled region. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, RegCritical, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + // Compare effective stall cycles between candidates. + // Effective stall = max(structural stall, latency stall) + // - Structural stalls: resource/hazard constraints (HW not ready) + // - Latency stalls: data dependency constraints (operands not ready) + // + // This allows picking a pending instruction with structural stalls over + // an available instruction with higher latency stalls (e.g., scheduling + // a WMMA while waiting for a memory load result). 
+    unsigned TryStructStall = getStructuralStallCycles(*Zone, TryCand.SU);
+    unsigned TryLatencyStall = Zone->getLatencyStallCycles(TryCand.SU);
+    unsigned TryEffectiveStall = std::max(TryStructStall, TryLatencyStall);
+
+    unsigned CandStructStall = getStructuralStallCycles(*Zone, Cand.SU);
+    unsigned CandLatencyStall = Zone->getLatencyStallCycles(Cand.SU);
+    unsigned CandEffectiveStall = std::max(CandStructStall, CandLatencyStall);
+
+    LLVM_DEBUG(if (TryEffectiveStall || CandEffectiveStall) {
+      dbgs() << "Effective stalls: try=" << TryEffectiveStall
+             << " (struct=" << TryStructStall << ", lat=" << TryLatencyStall
+             << ") cand=" << CandEffectiveStall
+             << " (struct=" << CandStructStall << ", lat=" << CandLatencyStall
+             << ")\n";
+    });
+
+    if (tryLess(TryEffectiveStall, CandEffectiveStall, TryCand, Cand, Stall))
+      return TryCand.Reason != NoCand;
+
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return TryCand.Reason != NoCand;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return TryCand.Reason != NoCand;
+  }
+
+  return false;
+}
+
 AMDGPUMLPostSchedStrategy::AMDGPUMLPostSchedStrategy(
     const MachineSchedContext *C)
     : PostGenericScheduler(C) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
index fd13b57a28f43..05465668e0014 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
@@ -24,6 +24,9 @@ class AMDGPUMLSchedStrategy final : public GCNSchedStrategy {
   bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
                     SchedBoundary *Zone) const override;
 
+  bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                           SchedBoundary *Zone) const override;
+
 public:
   AMDGPUMLSchedStrategy(const MachineSchedContext *C);
 
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 7fbf520c670ae..ff955c7354cba 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -313,6 +313,10 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
   return std::max(W, NopPadding.getValue());
 }
 
+unsigned GCNHazardRecognizer::getHazardWaitStates(MachineInstr *MI) const {
+  return const_cast<GCNHazardRecognizer *>(this)->PreEmitNoopsCommon(MI);
+}
+
 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
   if (MI->isBundle())
     return 0;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 67beffadc0913..be914d8657870 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -145,6 +145,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   void EmitInstruction(SUnit *SU) override;
   void EmitInstruction(MachineInstr *MI) override;
   HazardType getHazardType(SUnit *SU, int Stalls) override;
+
+  /// Returns the number of wait states until all hazards for \p MI are
+  /// resolved. This is useful for scheduling heuristics that want
+  /// cycle-accurate hazard information rather than just a boolean. Unlike
+  /// PreEmitNoops, this does not modify state or fix hazards.
+  unsigned getHazardWaitStates(MachineInstr *MI) const;
   void EmitNoop() override;
   unsigned PreEmitNoops(MachineInstr *) override;
   unsigned PreEmitNoopsCommon(MachineInstr *);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index b9362c41cdb7c..ee76368d08859 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,6 +25,7 @@
 #include "GCNSchedStrategy.h"
 #include "AMDGPUIGroupLP.h"
+#include "GCNHazardRecognizer.h"
 #include "GCNRegPressure.h"
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
@@ -218,6 +219,40 @@ void GCNSchedStrategy::getRegisterPressures(
   Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
 }
 
+unsigned GCNSchedStrategy::getStructuralStallCycles(SchedBoundary &Zone,
+                                                    SUnit *SU) const {
+  // Only implemented for top-down scheduling currently.
+  if (!Zone.isTop() || !SU)
+    return 0;
+
+  MachineInstr *MI = SU->getInstr();
+  unsigned CurrCycle = Zone.getCurrCycle();
+  unsigned Stall = 0;
+
+  // Query SchedModel for resource stalls (unbuffered resources).
+  if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
+    const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+    for (const MCWriteProcResEntry &PE :
+         make_range(SchedModel->getWriteProcResBegin(SC),
+                    SchedModel->getWriteProcResEnd(SC))) {
+      unsigned NextAvail =
+          Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
+                                    PE.AcquireAtCycle)
+              .first;
+      if (NextAvail > CurrCycle)
+        Stall = std::max(Stall, NextAvail - CurrCycle);
+    }
+  }
+
+  // Query HazardRecognizer for sequence-dependent hazard penalties.
+  if (Zone.HazardRec && Zone.HazardRec->isEnabled()) {
+    auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
+    Stall = std::max(Stall, HR->getHazardWaitStates(MI));
+  }
+
+  return Stall;
+}
+
 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                      bool AtTop,
                                      const RegPressureTracker &RPTracker,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 367f47c3ca4ae..048eeecac0ab9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -56,6 +56,10 @@ class GCNSchedStrategy : public GenericScheduler {
                             const SIRegisterInfo *SRI, unsigned SGPRPressure,
                             unsigned VGPRPressure, bool IsBottomUp);
 
+  /// Estimate how many cycles \p SU must wait due to structural hazards at the
+  /// current boundary cycle. Returns zero when no stall is required.
+  unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const;
+
   /// Evaluates instructions in the pending queue using a subset of scheduling
   /// heuristics.
   ///
   /// Pending queue instructions are those whose dependencies are met, but which
   /// are waiting on latency or hardware resource availability. They are
   /// invisible to scheduling heuristics. However, in certain scenarios (such as
   /// avoiding register spilling), it may be beneficial to consider scheduling
   /// these not-yet-ready instructions.
- bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary *Zone) const; + virtual bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const; void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred); diff --git a/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir index bb82c7364d0ff..0a04e7964e63c 100644 --- a/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir +++ b/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir @@ -12,6 +12,8 @@ attributes #1 = { "amdgpu-waves-per-eu"="1,1" } ... +# The scheduler should reorder the use of the global load after WMMAs to hide memory latency. + --- name: with_ml_workload_attr tracksRegLiveness: true @@ -31,8 +33,8 @@ body: | ; DEFAULT-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; DEFAULT-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec ; DEFAULT-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; DEFAULT-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; DEFAULT-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14 ; ; ML-LABEL: name: with_ml_workload_attr @@ -49,8 +51,8 @@ body: | ; ML-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec ; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; ML-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; ML-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14 %0:vreg_512_align2 = IMPLICIT_DEF %1:vreg_512_align2 = IMPLICIT_DEF @@ -108,8 +110,8 @@ body: | ; ML-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec ; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit 
$mode, implicit $exec ; ML-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; ML-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14 %0:vreg_512_align2 = IMPLICIT_DEF %1:vreg_512_align2 = IMPLICIT_DEF