72 changes: 72 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp
@@ -13,6 +13,10 @@

#include "AMDGPUMLSchedStrategy.h"

#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

AMDGPUMLSchedStrategy::AMDGPUMLSchedStrategy(const MachineSchedContext *C)
@@ -130,6 +134,74 @@ bool AMDGPUMLSchedStrategy::tryCandidate(SchedCandidate &Cand,
return false;
}

bool AMDGPUMLSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return true;
}

// Bias PhysReg defs and copies toward their uses and defs, respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;

// Avoid exceeding the target's limit.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;

// Avoid increasing the max critical pressure in the scheduled region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;

bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
// Compare effective stall cycles between candidates.
// Effective stall = max(structural stall, latency stall)
// - Structural stalls: resource/hazard constraints (HW not ready)
// - Latency stalls: data dependency constraints (operands not ready)
//
// This allows picking a pending instruction with structural stalls over
// an available instruction with higher latency stalls (e.g., scheduling
// a WMMA while waiting for a memory load result).
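//
// Worked example (illustrative numbers): a pending WMMA whose pipe is busy
// for 4 more cycles (struct=4, lat=0) has effective stall 4; an available
// instruction waiting 16 cycles on a load result (struct=0, lat=16) has
// effective stall 16, so the pending WMMA is preferred.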
unsigned TryStructStall = getStructuralStallCycles(*Zone, TryCand.SU);
unsigned TryLatencyStall = Zone->getLatencyStallCycles(TryCand.SU);
unsigned TryEffectiveStall = std::max(TryStructStall, TryLatencyStall);

unsigned CandStructStall = getStructuralStallCycles(*Zone, Cand.SU);
unsigned CandLatencyStall = Zone->getLatencyStallCycles(Cand.SU);
unsigned CandEffectiveStall = std::max(CandStructStall, CandLatencyStall);

LLVM_DEBUG(if (TryEffectiveStall || CandEffectiveStall) {
dbgs() << "Effective stalls: try=" << TryEffectiveStall
<< " (struct=" << TryStructStall << ", lat=" << TryLatencyStall
<< ") cand=" << CandEffectiveStall
<< " (struct=" << CandStructStall << ", lat=" << CandLatencyStall
<< ")\n";
});

if (tryLess(TryEffectiveStall, CandEffectiveStall, TryCand, Cand, Stall))
return TryCand.Reason != NoCand;

TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
}

return false;
}

AMDGPUMLPostSchedStrategy::AMDGPUMLPostSchedStrategy(
const MachineSchedContext *C)
: PostGenericScheduler(C) {}
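For readers unfamiliar with the GenericScheduler helpers used above, here is a minimal, self-contained sketch of the win/lose/tie protocol behind tryLess/tryGreater (assumed semantics, simplified from the upstream helpers; the enum and struct are stand-ins, not LLVM's types):

#include <cstdio>

enum CandReason { NoCand, Stall };

struct Candidate {
  unsigned EffectiveStall = 0;
  CandReason Reason = NoCand;
};

// Simplified tryLess: prefer the smaller value. Returns true once a decision
// is made; Reason stays NoCand when TryCand loses, which is why the callers
// above return `TryCand.Reason != NoCand`.
bool tryLess(unsigned TryVal, unsigned CandVal, Candidate &TryCand,
             CandReason R) {
  if (TryVal < CandVal) {
    TryCand.Reason = R; // TryCand wins.
    return true;
  }
  if (TryVal > CandVal)
    return true; // Cand wins; TryCand.Reason remains NoCand.
  return false;  // Tie: fall through to the next heuristic.
}

int main() {
  Candidate Try{4}, Cand{16}; // effective stalls as in the comment above
  if (tryLess(Try.EffectiveStall, Cand.EffectiveStall, Try, Stall))
    std::printf("try %s\n", Try.Reason != NoCand ? "wins" : "loses");
}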
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
@@ -24,6 +24,9 @@ class AMDGPUMLSchedStrategy final : public GCNSchedStrategy {
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;

bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;

public:
AMDGPUMLSchedStrategy(const MachineSchedContext *C);

4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -313,6 +313,10 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
return std::max(W, NopPadding.getValue());
}

unsigned GCNHazardRecognizer::getHazardWaitStates(MachineInstr *MI) const {
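// PreEmitNoopsCommon only computes the required wait states; it does not
// modify recognizer state (see the header comment), so the const_cast is
// safe here.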
return const_cast<GCNHazardRecognizer *>(this)->PreEmitNoopsCommon(MI);
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (MI->isBundle())
return 0;
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -145,6 +145,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
void EmitInstruction(SUnit *SU) override;
void EmitInstruction(MachineInstr *MI) override;
HazardType getHazardType(SUnit *SU, int Stalls) override;

/// Returns the number of wait states until all hazards for \p MI are
/// resolved. This is useful for scheduling heuristics that want
/// cycle-accurate hazard information rather than just a boolean. Unlike
/// PreEmitNoops, this does not modify state or fix hazards.
unsigned getHazardWaitStates(MachineInstr *MI) const;
void EmitNoop() override;
unsigned PreEmitNoops(MachineInstr *) override;
unsigned PreEmitNoopsCommon(MachineInstr *);
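For context, this is the call pattern the new query enables, quoted from the GCNSchedStrategy.cpp change later in this diff: the boundary's generic hazard recognizer is downcast to the GCN-specific one, which exposes the cycle-accurate query.

if (Zone.HazardRec && Zone.HazardRec->isEnabled()) {
  auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
  Stall = std::max(Stall, HR->getHazardWaitStates(MI));
}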
35 changes: 35 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,6 +25,7 @@

#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "GCNHazardRecognizer.h"
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -218,6 +219,40 @@ void GCNSchedStrategy::getRegisterPressures(
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}

unsigned GCNSchedStrategy::getStructuralStallCycles(SchedBoundary &Zone,
SUnit *SU) const {
// Only implemented for top-down scheduling currently.
if (!Zone.isTop() || !SU)
return 0;

MachineInstr *MI = SU->getInstr();
unsigned CurrCycle = Zone.getCurrCycle();
unsigned Stall = 0;

// Query SchedModel for resource stalls (unbuffered resources).
if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
for (const MCWriteProcResEntry &PE :
make_range(SchedModel->getWriteProcResBegin(SC),
SchedModel->getWriteProcResEnd(SC))) {
unsigned NextAvail =
Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
PE.AcquireAtCycle)
.first;
if (NextAvail > CurrCycle)
Stall = std::max(Stall, NextAvail - CurrCycle);
}
}

// Query HazardRecognizer for sequence-dependent hazard penalties.
if (Zone.HazardRec && Zone.HazardRec->isEnabled()) {
auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
Stall = std::max(Stall, HR->getHazardWaitStates(MI));
}

return Stall;
}

void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
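As a compact model of what getStructuralStallCycles returns, the sketch below merges the same two stall sources. The helpers nextResourceAvailCycle and hazardWaitStates are hypothetical stand-ins with toy values for the real queries (SchedBoundary::getNextResourceCycle and GCNHazardRecognizer::getHazardWaitStates):

#include <algorithm>

unsigned nextResourceAvailCycle() { return 12; } // reserved resource free at cycle 12
unsigned hazardWaitStates() { return 3; }        // 3 wait states of pending hazards

unsigned structuralStall(unsigned CurrCycle) {
  unsigned Stall = 0;
  // Resource stalls: cycles until the reserved resource is free again.
  unsigned NextAvail = nextResourceAvailCycle();
  if (NextAvail > CurrCycle)
    Stall = std::max(Stall, NextAvail - CurrCycle);
  // Sequence-dependent hazards: wait states the hazard recognizer would
  // otherwise pad with nops. Take the max; stalls overlap rather than add.
  return std::max(Stall, hazardWaitStates());
}

int main() { return structuralStall(10) == 3 ? 0 : 1; } // max(12 - 10, 3) == 3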
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -56,6 +56,10 @@ class GCNSchedStrategy : public GenericScheduler {
const SIRegisterInfo *SRI, unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp);

/// Estimate how many cycles \p SU must wait due to structural hazards at the
/// current boundary cycle. Returns zero when no stall is required.
unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const;

/// Evaluates instructions in the pending queue using a subset of scheduling
/// heuristics.
///
@@ -64,8 +68,8 @@
/// invisible to scheduling heuristics. However, in certain scenarios (such as
/// avoiding register spilling), it may be beneficial to consider scheduling
/// these not-yet-ready instructions.
- bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
- SchedBoundary *Zone) const;
+ virtual bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;

void printCandidateDecision(const SchedCandidate &Current,
const SchedCandidate &Preferred);
8 changes: 5 additions & 3 deletions llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir
@@ -12,6 +12,8 @@
attributes #1 = { "amdgpu-waves-per-eu"="1,1" }
...

# The scheduler should move the use of the global load to after the WMMAs to hide the load's latency.

---
name: with_ml_workload_attr
tracksRegLiveness: true
@@ -31,8 +33,8 @@ body: |
; DEFAULT-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
; DEFAULT-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
;
; ML-LABEL: name: with_ml_workload_attr
@@ -49,8 +51,8 @@ body: |
; ML-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; ML-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; ML-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
%0:vreg_512_align2 = IMPLICIT_DEF
%1:vreg_512_align2 = IMPLICIT_DEF
@@ -108,8 +110,8 @@ body: |
; ML-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; ML-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; ML-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
%0:vreg_512_align2 = IMPLICIT_DEF
%1:vreg_512_align2 = IMPLICIT_DEF