159 changes: 157 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -466,6 +466,8 @@ class SIInsertWaitcnts {
// Set to true when relaxation is actually applied in the loop body.
// Used to determine if preheader needs DS_CNT flush.
mutable bool RelaxationApplied = false;
// Pointer to the last barrier in the loop (found during eligibility check)
const MachineInstr *LastBarrier = nullptr;
};

// Cache of loop DS wait optimization info, keyed by loop header MBB.
@@ -600,6 +602,9 @@ class SIInsertWaitcnts {
// DS loop wait optimization functions
bool isEligibleForDSLoopOpt(MachineLoop *ML, LoopDSWaitOptInfo &Info) const;
void analyzeSingleBBLoopDSLoads(MachineLoop *ML);
std::optional<unsigned> getOptimalDSWaitCount(MachineBasicBlock *LoopHeader,
const MachineInstr &MI) const;
bool applyDSLoopWaitOpt(MachineInstr &MI, AMDGPU::Waitcnt &Wait);
};

// This object maintains the current score brackets of each wait counter, and
@@ -2138,6 +2143,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);

// DS Loop Wait Optimization (GFX12+):
// Try to relax conservative DS wait counts in single-block loops with WMMA.
applyDSLoopWaitOpt(MI, Wait);

// Since the translation for VMEM addresses occurs in-order, we can apply the
// XCnt if the current instruction is of VMEM type and has a memory
// dependency with another VMEM instruction in flight.
@@ -2701,9 +2710,11 @@ bool SIInsertWaitcnts::isEligibleForDSLoopOpt(MachineLoop *ML,
MachineBasicBlock *MBB = ML->getHeader();

// Count DS loads, WMMA/MFMA instructions, and total non-meta instructions
// Also find the last barrier during this traversal to avoid re-traversing
unsigned NumDSLoads = 0;
unsigned NumWMMA = 0;
unsigned NumInsts = 0;
Info.LastBarrier = nullptr;

for (const MachineInstr &MI : *MBB) {
if (!MI.isMetaInstruction())
@@ -2715,6 +2726,13 @@ bool SIInsertWaitcnts::isEligibleForDSLoopOpt(MachineLoop *ML,
} else if (SIInstrInfo::isWMMA(MI) || SIInstrInfo::isMFMA(MI)) {
++NumWMMA;
}

// Track the last barrier instruction
if (MI.getOpcode() == AMDGPU::S_BARRIER ||
MI.getOpcode() == AMDGPU::S_BARRIER_SIGNAL_IMM ||
MI.getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM) {
Info.LastBarrier = &MI;
}
}

// Heuristics: need significant number of DS loads and WMMA/MFMA
Expand Down Expand Up @@ -2745,8 +2763,145 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
return;
}

// For now, just mark as invalid - full analysis comes in a later PR.
Info.Valid = false;
// We are looking for something similar to software-pipelined GEMM loops,
// where the last part of the loop body prefetches data for the next
// iteration. Such code also has loads in the preheader block whose ordering
// may differ significantly from the load ordering at the end of the loop
// body, since the two orderings are not co-optimized. That can result in
// rather conservative LDS wait counts.
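// Schematic of the targeted loop shape (illustrative sketch only, not taken
// from real output):
//   loop:
//     s_wait_dscnt N ; wait on the previous iteration's prefetch loads
//     v_wmma ...     ; consume data prefetched by the previous iteration
//     s_barrier
//     ds_load ...    ; prefetch data for the next iteration
//     s_branch loop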

// We only care about the LDS loads after the last barrier in the loop body,
// if one exists. LastBarrier was already found during the eligibility check.
// These are likely to be prefetch loads whose results are used in the next
// iteration.
const MachineInstr *LastBarrier = Info.LastBarrier;

unsigned LoadPosition = 0;
bool AfterLastBarrier = (LastBarrier == nullptr); // If no barrier, track all

for (const MachineInstr &MI : *MBB) {
if (&MI == LastBarrier) {
AfterLastBarrier = true;
continue;
}

if (!AfterLastBarrier)
continue;

// Check DS instructions
if (SIInstrInfo::isDS(MI)) {
// DS stores after the barrier are not allowed: they use the same counter
// and may complete out of order with the loads.
if (MI.mayStore()) {
LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: DS store after last barrier, "
<< "skipping\n");
Review comment (Contributor) on lines +2796 to +2797, suggested change:

- LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: DS store after last barrier, "
- << "skipping\n");
+ LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: DS store after last barrier, "
+ "skipping\n");
Info.Valid = false;
return;
}

// Track DS loads - record position
if (MI.mayLoad()) {
++LoadPosition;
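// Illustrative example: if the third tracked load is
// $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 ..., every register unit of
// that tuple is mapped to position 3, so any later reader of those VGPRs
// can be matched back to load #3.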
for (const MachineOperand &Op : MI.defs()) {
if (Op.isReg() && Op.getReg().isPhysical() &&
Review comment (Contributor): All registers must be physical at this point.
TRI->isVGPR(*MRI, Op.getReg())) {
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
Info.VGPRToLoadPosition[static_cast<unsigned>(Unit)] =
LoadPosition;
}
}
}
}
}
}

Info.TotalDSLoads = LoadPosition;
Info.Valid = Info.TotalDSLoads > 0;

LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Analyzed loop at ";
MBB->printName(dbgs());
dbgs() << " - " << Info.TotalDSLoads << " DS loads"
<< ", HasBarrier=" << (LastBarrier != nullptr)
<< ", Valid=" << Info.Valid << "\n");
Review comment (Contributor) on lines +2821 to +2825, suggested change:

- LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Analyzed loop at ";
- MBB->printName(dbgs());
- dbgs() << " - " << Info.TotalDSLoads << " DS loads"
- << ", HasBarrier=" << (LastBarrier != nullptr)
- << ", Valid=" << Info.Valid << "\n");
+ LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Analyzed loop at "
+ << printMBBReference(*MBB) << " - " << Info.TotalDSLoads << " DS loads"
+ << ", HasBarrier=" << (LastBarrier != nullptr)
+ << ", Valid=" << Info.Valid << '\n');

}

std::optional<unsigned>
SIInsertWaitcnts::getOptimalDSWaitCount(MachineBasicBlock *LoopHeader,
const MachineInstr &MI) const {
auto It = LoopDSWaitOptCache.find(LoopHeader);
if (It == LoopDSWaitOptCache.end() || !It->second.Valid)
return std::nullopt;

const LoopDSWaitOptInfo &Info = It->second;

// Find the maximum load position among all VGPR operands used by MI
unsigned MaxLoadPosition = 0;
for (const MachineOperand &Op : MI.operands()) {
Review comment (Contributor), suggested change:

- for (const MachineOperand &Op : MI.operands()) {
+ for (const MachineOperand &Op : MI.all_uses()) {

if (!Op.isReg() || !Op.isUse() || !Op.getReg().isPhysical())
continue;
if (!TRI->isVGPR(*MRI, Op.getReg()))
continue;

for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
auto PosIt = Info.VGPRToLoadPosition.find(static_cast<unsigned>(Unit));
if (PosIt != Info.VGPRToLoadPosition.end()) {
MaxLoadPosition = std::max(MaxLoadPosition, PosIt->second);
}
}
}

if (MaxLoadPosition == 0)
return std::nullopt;

// Optimal wait = TotalDSLoads - MaxLoadPosition
// This means we wait until all loads up to and including MaxLoadPosition
// have completed, but loads after it can still be in flight.
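// Worked example (numbers borrowed from the eligibility test below): with
// TotalDSLoads = 16 and MI's newest input written by the load at position 4,
// the result is 16 - 4 = 12, i.e. the wait lets 12 prefetch loads stay in
// flight.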
return Info.TotalDSLoads - MaxLoadPosition;
}

// Try to apply DS loop wait optimization to relax conservative wait counts.
// Returns true if the wait count was modified.
bool SIInsertWaitcnts::applyDSLoopWaitOpt(MachineInstr &MI,
AMDGPU::Waitcnt &Wait) {
// Only applies to GFX12+ with separate DS counter
if (!ST->hasExtendedWaitCounts())
return false;

// Only optimize if baseline wants a DS wait
if (Wait.DsCnt == ~0u)
return false;

MachineBasicBlock *MBB = MI.getParent();
MachineLoop *ML = MLI->getLoopFor(MBB);

// Only apply in single-block loop headers
if (!ML || ML->getNumBlocks() != 1 || ML->getHeader() != MBB)
return false;

auto CacheIt = LoopDSWaitOptCache.find(MBB);
if (CacheIt == LoopDSWaitOptCache.end() || !CacheIt->second.Valid)
return false;

// Only optimize if wait is conservative (less than half of loads in flight)
unsigned HalfLoads = CacheIt->second.TotalDSLoads / 2;
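// Illustrative example: with 16 tracked loads, HalfLoads is 8, so only a
// baseline DsCnt of 0..7 (waiting for most of the loads) is considered
// conservative enough to relax.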
if (Wait.DsCnt >= HalfLoads)
return false;

auto OptWait = getOptimalDSWaitCount(MBB, MI);
if (!OptWait)
return false;

// Only relax the wait (increase the count), never tighten it
// and only when the relaxation is significant (at least 4 more)
if (*OptWait <= Wait.DsCnt || (*OptWait - Wait.DsCnt) < 4)
return false;

LLVM_DEBUG(dbgs() << "DS Loop Opt: Relaxing DsCnt from " << Wait.DsCnt
<< " to " << *OptWait << " for: " << MI);
Wait.DsCnt = *OptWait;
// Mark that relaxation was applied so preheader flush is inserted
CacheIt->second.RelaxationApplied = true;
return true;
}

// Return true if it is better to flush the vmcnt counter in the preheader of
43 changes: 37 additions & 6 deletions llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
@@ -1,24 +1,44 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -verify-machineinstrs -o - %s | FileCheck -check-prefix=OPT %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=false -verify-machineinstrs -o - %s | FileCheck -check-prefix=NOOPT %s

# Debug output test (requires asserts build)
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -debug-only=si-insert-waitcnts -o /dev/null %s 2>&1 | FileCheck -check-prefix=DBG %s
# REQUIRES: asserts
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -debug-only=si-insert-waitcnts -o /dev/null %s 2>&1 | FileCheck %s

# Test for DS loop wait optimization eligibility check.
# Verifies that the pass correctly identifies single-block loops with
# sufficient DS loads (>=16) and WMMA instructions (>=8) as eligible
# for optimization.
# Test for DS loop wait optimization in single-block loops with WMMA.
# The preheader DS loads are in reverse order compared to loop body loads,
# which causes the baseline to produce conservative waits that the optimization
# can relax.
#
# Key improvement demonstrated:
# Without opt: S_WAIT_DSCNT 0 (wait for ALL 16 loads) before first WMMA
# With opt: S_WAIT_DSCNT 12 (wait for only 4 loads, 12 remain in flight)
#
# CHECK: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible
# DBG: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible
# DBG: Loop DS Wait Opt: Analyzed loop at bb.1 - 16 DS loads, HasBarrier=1, Valid=1
# DBG: DS Loop Opt: Relaxing DsCnt from 0 to 12 for:

--- |
define amdgpu_kernel void @ds_loop_eligible() { ret void }
...

---
# OPT-LABEL: name: ds_loop_eligible
# NOOPT-LABEL: name: ds_loop_eligible
name: ds_loop_eligible
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
waveLimiter: false
body: |
; OPT: bb.0:
; OPT-NOT: S_WAIT_DSCNT
; OPT: S_BRANCH %bb.1

; NOOPT: bb.0:
; NOOPT-NOT: S_WAIT_DSCNT
; NOOPT: S_BRANCH %bb.1

bb.0:
successors: %bb.1
liveins: $sgpr0, $vgpr0
@@ -44,6 +64,17 @@ body: |
$vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
S_BRANCH %bb.1

; OPT: bb.1:
; OPT: S_WAIT_DSCNT 12
; OPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
; OPT: S_WAIT_DSCNT 8
; OPT-NEXT: early-clobber $vgpr88{{.*}} = V_WMMA

; NOOPT: bb.1:
; NOOPT: S_WAIT_DSCNT 0
; NOOPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
; NOOPT-NOT: S_WAIT_DSCNT 8

bb.1:
; Single-block loop with WMMA and DS loads after barrier
successors: %bb.1, %bb.2