-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[AMDGPU] Add DS loop wait optimization infrastructure (1/4) #171942
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Add the infrastructure for DS wait count optimization in single-block loops with WMMA instructions (GFX12+). This patch adds the loop eligibility check. This is the first of 4 patches to split the DS loop wait optimization. Subsequent patches will add: - DS load position analysis - Wait count relaxation - Preheader flush and edge case handling Assisted-by: Cursor / claude-4.5-opus-high
|
@llvm/pr-subscribers-backend-amdgpu Author: None (hidekisaito) Changes: Add the infrastructure for DS wait count optimization in single-block loops with WMMA instructions (GFX12+). This patch adds the loop eligibility check. This is the first of 4 patches to split the DS loop wait optimization. Subsequent patches will add:
Assisted-by: Cursor / claude-4.5-opus-high Full diff: https://github.com/llvm/llvm-project/pull/171942.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 146f3604d9f8f..140b79136227c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -63,6 +63,12 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> OptimizeDSLoopWaitcnt(
+ "amdgpu-waitcnt-loop-ds-opt",
+ cl::desc(
+ "Optimize DS wait counts in single-block loops with WMMA (GFX12+)"),
+ cl::init(false), cl::Hidden);
+
namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
@@ -448,6 +454,23 @@ class SIInsertWaitcnts {
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
+ // Single-block loop DS wait optimization (GFX12+)
+ // This optimization relaxes DS wait counts in loops with many DS loads and
+ // WMMA instructions, allowing more overlap between memory and compute.
+ struct LoopDSWaitOptInfo {
+ // Maps VGPR number to the position (1-based) of the DS load that writes it.
+ // Position 1 = first DS load in sequence, etc.
+ DenseMap<unsigned, unsigned> VGPRToLoadPosition;
+ unsigned TotalDSLoads = 0;
+ bool Valid = false;
+ // Set to true when relaxation is actually applied in the loop body.
+ // Used to determine if preheader needs DS_CNT flush.
+ mutable bool RelaxationApplied = false;
+ };
+
+ // Cache of loop DS wait optimization info, keyed by loop header MBB.
+ DenseMap<MachineBasicBlock *, LoopDSWaitOptInfo> LoopDSWaitOptCache;
+
HardwareLimits Limits;
public:
@@ -573,6 +596,10 @@ class SIInsertWaitcnts {
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
+
+ // DS loop wait optimization functions
+ bool isEligibleForDSLoopOpt(MachineLoop *ML, LoopDSWaitOptInfo &Info) const;
+ void analyzeSingleBBLoopDSLoads(MachineLoop *ML);
};
// This objects maintains the current score brackets of each wait counter, and
@@ -2643,6 +2670,85 @@ bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
return SIInstrInfo::isVMEM(MI);
}
+//===----------------------------------------------------------------------===//
+// DS Loop Wait Optimization (GFX12+)
+//
+// This optimization relaxes DS wait counts in single-block loops that have
+// many DS loads and WMMA/MFMA instructions (typical GEMM kernels with software
+// pipelining). Instead of waiting for almost all DS loads to complete before
+// each WMMA, we analyze which specific loads feed each WMMA and wait only for
+// those to complete, allowing more overlap between memory and compute.
+//
+// The opportunity arises when the load ordering in the preheader block and
+// the load ordering at the end of the loop body (which feeds the loaded data
+// to the next iteration) do not match well, since the machine scheduler does
+// not co-schedule preheader loads with loop-body loads.
+//===----------------------------------------------------------------------===//
+
+bool SIInsertWaitcnts::isEligibleForDSLoopOpt(MachineLoop *ML,
+ LoopDSWaitOptInfo &Info) const {
+ if (!OptimizeDSLoopWaitcnt)
+ return false;
+
+ // Only for GFX12+ where we have a separate counter for LDS.
+ if (!ST->hasExtendedWaitCounts())
+ return false;
+
+ // Must be a single-block loop. Makes the analysis easier.
+ if (ML->getNumBlocks() != 1)
+ return false;
+
+ MachineBasicBlock *MBB = ML->getHeader();
+
+ // Count DS loads, WMMA/MFMA instructions, and total non-meta instructions
+ unsigned NumDSLoads = 0;
+ unsigned NumWMMA = 0;
+ unsigned NumInsts = 0;
+
+ for (const MachineInstr &MI : *MBB) {
+ if (!MI.isMetaInstruction())
+ ++NumInsts;
+
+ if (SIInstrInfo::isDS(MI)) {
+ if (MI.mayLoad() && !MI.mayStore())
+ ++NumDSLoads;
+ } else if (SIInstrInfo::isWMMA(MI) || SIInstrInfo::isMFMA(MI)) {
+ ++NumWMMA;
+ }
+ }
+
+ // Heuristics: need significant number of DS loads and WMMA/MFMA
+ // to make this optimization worthwhile
+ if (NumDSLoads < 16 || NumWMMA < 8)
+ return false;
+
+ // DS loads and WMMAs should be a significant portion of the loop body
+ // (at least 1/4 of the instructions)
+ if ((NumDSLoads + NumWMMA) * 4 < NumInsts)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Loop at "; MBB->printName(dbgs());
+ dbgs() << " - " << NumDSLoads << " DS loads, " << NumWMMA
+ << " WMMA/MFMA, " << NumInsts
+ << " total insts, eligible\n");
+
+ return true;
+}
+
+void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
+ MachineBasicBlock *MBB = ML->getHeader();
+ LoopDSWaitOptInfo &Info = LoopDSWaitOptCache[MBB];
+
+ // Quick structural checks
+ if (!isEligibleForDSLoopOpt(ML, Info)) {
+ Info.Valid = false;
+ return;
+ }
+
+ // For now, just mark as invalid - full analysis comes in a later PR.
+ Info.Valid = false;
+}
+
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
@@ -2786,6 +2892,22 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
BlockInfos.clear();
+ LoopDSWaitOptCache.clear();
+
+ // Analyze single-block loops for DS wait optimization (GFX12+)
+ if (OptimizeDSLoopWaitcnt && ST->hasExtendedWaitCounts()) {
+ SmallVector<MachineLoop *, 4> Worklist(MLI->begin(), MLI->end());
+ while (!Worklist.empty()) {
+ MachineLoop *ML = Worklist.pop_back_val();
+ auto BeginIt = ML->getSubLoops().begin();
+ auto EndIt = ML->getSubLoops().end();
+ if (BeginIt == EndIt) // innermost loop only
+ analyzeSingleBBLoopDSLoads(ML);
+ else
+ Worklist.append(BeginIt, EndIt);
+ }
+ }
+
bool Modified = false;
MachineBasicBlock &EntryBB = MF.front();
@@ -2975,6 +3097,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
}
ReleaseVGPRInsts.clear();
PreheadersToFlush.clear();
+ LoopDSWaitOptCache.clear();
SLoadAddresses.clear();
return Modified;
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
new file mode 100644
index 0000000000000..d7d2cf96ceac5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
@@ -0,0 +1,91 @@
+# REQUIRES: asserts
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -debug-only=si-insert-waitcnts -o /dev/null %s 2>&1 | FileCheck %s
+
+# Test for DS loop wait optimization eligibility check.
+# Verifies that the pass correctly identifies single-block loops with
+# sufficient DS loads (>=16) and WMMA instructions (>=8) as eligible
+# for optimization.
+#
+# CHECK: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible
+
+--- |
+ define amdgpu_kernel void @ds_loop_eligible() { ret void }
+...
+
+---
+name: ds_loop_eligible
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ waveLimiter: false
+body: |
+ bb.0:
+ successors: %bb.1
+ liveins: $sgpr0, $vgpr0
+
+ ; Preheader: DS loads in REVERSE order (last registers first)
+ ; This creates different register scores than the loop body order,
+ ; causing baseline to be conservative when merging predecessor info.
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ ; Single-block loop with WMMA and DS loads after barrier
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+
+ ; WMMA instructions (8 total, meets threshold)
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+
+ ; Barrier separates compute from prefetch
+ S_BARRIER
+
+ ; Prefetch DS loads for next iteration (16 total, meets threshold)
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+
+ ; Loop control
+ $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
|
| --- | | ||
| define amdgpu_kernel void @ds_loop_eligible() { ret void } | ||
| ... | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| --- | | |
| define amdgpu_kernel void @ds_loop_eligible() { ret void } | |
| ... |
|
|
||
| // DS loads and WMMAs should be a significant portion of the loop body | ||
| // (at least 1/4 of the instructions) | ||
| if ((NumDSLoads + NumWMMA) * 4 < NumInsts) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be in terms of cycles instead of instruction count? Can you use MachineTraceMetrics (I assume this would also avoid the single block limitation)
| // Only for GFX12+ where we have a separate counter for LDS. | ||
| if (!ST->hasExtendedWaitCounts()) | ||
| return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // Only for GFX12+ where we have a separate counter for LDS. | |
| if (!ST->hasExtendedWaitCounts()) | |
| return false; | |
| // Only for GFX12+ where we have a separate counter for LDS. | |
| assert(ST->hasExtendedWaitCounts()); |
The caller already checked this
| if (ML->getNumBlocks() != 1) | ||
| return false; | ||
|
|
||
| MachineBasicBlock *MBB = ML->getHeader(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| MachineBasicBlock *MBB = ML->getHeader(); | |
| const MachineBasicBlock *MBB = ML->getHeader(); |
| // Heuristics: need significant number of DS loads and WMMA/MFMA | ||
| // to make this optimization worthwhile | ||
| if (NumDSLoads < 16 || NumWMMA < 8) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // Heuristics: need significant number of DS loads and WMMA/MFMA | |
| // to make this optimization worthwhile | |
| if (NumDSLoads < 16 || NumWMMA < 8) | |
| // Heuristics: Need significant number of DS loads and WMMA/MFMA | |
| // to make this optimization worthwhile | |
| if (NumDSLoads < 16 || NumWMMA < 8) |
Why these numbers?
| @@ -573,6 +596,10 @@ class SIInsertWaitcnts { | |||
| WaitcntBrackets &ScoreBrackets); | |||
| bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, | |||
| WaitcntBrackets &ScoreBrackets); | |||
|
|
|||
| // DS loop wait optimization functions | |||
| bool isEligibleForDSLoopOpt(MachineLoop *ML, LoopDSWaitOptInfo &Info) const; | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| bool isEligibleForDSLoopOpt(MachineLoop *ML, LoopDSWaitOptInfo &Info) const; | |
| bool isEligibleForDSLoopOpt(const MachineLoop &ML, LoopDSWaitOptInfo &Info) const; |
| // Opportunity arises when the load ordering in the preheader block and | ||
| // the load ordering at the end of the loop body, feeding the loaded data | ||
| // to the next iteration, are not matched well (since their orderings are | ||
| // not co-optimized) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // not co-optimized) | |
| // not co-executed) |
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it clearer if I say "since machine scheduler doesn't co-schedule loads in preheader and loads in loop body"?
| LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Loop at "; MBB->printName(dbgs()); | ||
| dbgs() << " - " << NumDSLoads << " DS loads, " << NumWMMA | ||
| << " WMMA/MFMA, " << NumInsts | ||
| << " total insts, eligible\n"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Loop at "; MBB->printName(dbgs()); | |
| dbgs() << " - " << NumDSLoads << " DS loads, " << NumWMMA | |
| << " WMMA/MFMA, " << NumInsts | |
| << " total insts, eligible\n"); | |
| LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Loop at " << printMBBReference(MBB) | |
| << " - " << NumDSLoads << " DS loads, " << NumWMMA | |
| << " WMMA/MFMA, " << NumInsts | |
| << " total insts, eligible\n"); |
| Info.Valid = false; | ||
| return; | ||
| } | ||
|
|
||
| // For now, just mark as invalid - full analysis comes in a later PR. | ||
| Info.Valid = false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both of these paths are just setting to the original value of false. Invert this to set Valid on isEligibleForDSLoopOpt?
| if (SIInstrInfo::isDS(MI)) { | ||
| if (MI.mayLoad() && !MI.mayStore()) | ||
| ++NumDSLoads; | ||
| } else if (SIInstrInfo::isWMMA(MI) || SIInstrInfo::isMFMA(MI)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this is only for gfx12 there's no reason to consider mfma
|
I do have some high level concerns about the whole series:
|
Co-authored-by: Matt Arsenault <Matthew.Arsenault@amd.com>
🐧 Linux x64 Test Results
Failed Tests(click on a test name to see its output) LLVMLLVM.CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mirIf these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the |
🪟 Windows x64 Test Results
Failed Tests(click on a test name to see its output) LLVMLLVM.CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mirIf these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the |
Add the infrastructure for DS wait count optimization in single-block loops with WMMA instructions (GFX12+). This patch adds the loop eligibility check.
This is the first of 4 patches to split the DS loop wait optimization. Subsequent patches will add:
Assisted-by: Cursor / claude-4.5-opus-high