diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 140b79136227c..777491fb58b80 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -466,6 +466,8 @@ class SIInsertWaitcnts {
     // Set to true when relaxation is actually applied in the loop body.
     // Used to determine if preheader needs DS_CNT flush.
     mutable bool RelaxationApplied = false;
+    // Last barrier in the loop, found during the eligibility check.
+    const MachineInstr *LastBarrier = nullptr;
   };

   // Cache of loop DS wait optimization info, keyed by loop header MBB.
@@ -600,6 +602,9 @@ class SIInsertWaitcnts {
   // DS loop wait optimization functions
   bool isEligibleForDSLoopOpt(MachineLoop *ML, LoopDSWaitOptInfo &Info) const;
   void analyzeSingleBBLoopDSLoads(MachineLoop *ML);
+  std::optional<unsigned> getOptimalDSWaitCount(MachineBasicBlock *LoopHeader,
+                                                const MachineInstr &MI) const;
+  bool applyDSLoopWaitOpt(MachineInstr &MI, AMDGPU::Waitcnt &Wait);
 };

 // This objects maintains the current score brackets of each wait counter, and
@@ -2138,6 +2143,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // Verify that the wait is actually needed.
   ScoreBrackets.simplifyWaitcnt(Wait);

+  // DS Loop Wait Optimization (GFX12+):
+  // Try to relax conservative DS wait counts in single-block loops with WMMA.
+  applyDSLoopWaitOpt(MI, Wait);
+
   // Since the translation for VMEM addresses occur in-order, we can apply the
   // XCnt if the current instruction is of VMEM type and has a memory
   // dependency with another VMEM instruction in flight.
@@ -2701,9 +2710,11 @@ bool SIInsertWaitcnts::isEligibleForDSLoopOpt(MachineLoop *ML,
   MachineBasicBlock *MBB = ML->getHeader();

   // Count DS loads, WMMA/MFMA instructions, and total non-meta instructions
+  // Also find the last barrier during this traversal to avoid a second scan.
   unsigned NumDSLoads = 0;
   unsigned NumWMMA = 0;
   unsigned NumInsts = 0;
+  Info.LastBarrier = nullptr;

   for (const MachineInstr &MI : *MBB) {
     if (!MI.isMetaInstruction())
@@ -2715,6 +2726,13 @@ bool SIInsertWaitcnts::isEligibleForDSLoopOpt(MachineLoop *ML,
     } else if (SIInstrInfo::isWMMA(MI) || SIInstrInfo::isMFMA(MI)) {
       ++NumWMMA;
     }
+
+    // Track the last barrier instruction.
+    if (MI.getOpcode() == AMDGPU::S_BARRIER ||
+        MI.getOpcode() == AMDGPU::S_BARRIER_SIGNAL_IMM ||
+        MI.getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM) {
+      Info.LastBarrier = &MI;
+    }
   }

   // Heuristics: need significant number of DS loads and WMMA/MFMA
@@ -2745,8 +2763,145 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
     return;
   }

-  // For now, just mark as invalid - full analysis comes in a later PR.
-  Info.Valid = false;
+  // Look for something similar to software-pipelined GEMM loops, where the
+  // last part of the loop body prefetches data for the next iteration. Such
+  // code also has loads in the preheader block whose ordering may differ
+  // significantly from the load ordering at the end of the loop body, since
+  // the two orderings are not co-optimized. That can result in rather
+  // conservative LDS wait counts.
+
+  // We only care about the LDS loads after the last barrier in the loop body,
+  // if one exists; LastBarrier was already found during the eligibility
+  // check. These loads are likely prefetches whose results are used in the
+  // next iteration.
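+  //
+  // A hypothetical sketch of the loop shape being targeted (instruction
+  // names are illustrative only):
+  //
+  //   bb.1:                    ; single-block loop
+  //     v_wmma ...             ; consumes data loaded in the last iteration
+  //     ...
+  //     s_barrier
+  //     ds_load_b128 ...       ; load position 1 (prefetch for next iter)
+  //     ...
+  //     ds_load_b128 ...       ; load position TotalDSLoads
+  //     s_cbranch_scc1 bb.1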
+  const MachineInstr *LastBarrier = Info.LastBarrier;
+
+  unsigned LoadPosition = 0;
+  bool AfterLastBarrier = (LastBarrier == nullptr); // If no barrier, track all
+
+  for (const MachineInstr &MI : *MBB) {
+    if (&MI == LastBarrier) {
+      AfterLastBarrier = true;
+      continue;
+    }
+
+    if (!AfterLastBarrier)
+      continue;
+
+    // Check DS instructions.
+    if (SIInstrInfo::isDS(MI)) {
+      // DS stores after the barrier are not allowed: they share the DS
+      // counter and may complete out of order with loads.
+      if (MI.mayStore()) {
+        LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: DS store after last barrier, "
+                          << "skipping\n");
+        Info.Valid = false;
+        return;
+      }
+
+      // Track DS loads - record the position of each load.
+      if (MI.mayLoad()) {
+        ++LoadPosition;
+        for (const MachineOperand &Op : MI.defs()) {
+          if (Op.isReg() && Op.getReg().isPhysical() &&
+              TRI->isVGPR(*MRI, Op.getReg())) {
+            for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
+              Info.VGPRToLoadPosition[static_cast<unsigned>(Unit)] =
+                  LoadPosition;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  Info.TotalDSLoads = LoadPosition;
+  Info.Valid = Info.TotalDSLoads > 0;
+
+  LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Analyzed loop at ";
+             MBB->printName(dbgs());
+             dbgs() << " - " << Info.TotalDSLoads << " DS loads"
+                    << ", HasBarrier=" << (LastBarrier != nullptr)
+                    << ", Valid=" << Info.Valid << "\n");
+}
+
+std::optional<unsigned>
+SIInsertWaitcnts::getOptimalDSWaitCount(MachineBasicBlock *LoopHeader,
+                                        const MachineInstr &MI) const {
+  auto It = LoopDSWaitOptCache.find(LoopHeader);
+  if (It == LoopDSWaitOptCache.end() || !It->second.Valid)
+    return std::nullopt;
+
+  const LoopDSWaitOptInfo &Info = It->second;
+
+  // Find the maximum load position among all VGPR operands used by MI.
+  unsigned MaxLoadPosition = 0;
+  for (const MachineOperand &Op : MI.operands()) {
+    if (!Op.isReg() || !Op.isUse() || !Op.getReg().isPhysical())
+      continue;
+    if (!TRI->isVGPR(*MRI, Op.getReg()))
+      continue;
+
+    for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
+      auto PosIt = Info.VGPRToLoadPosition.find(static_cast<unsigned>(Unit));
+      if (PosIt != Info.VGPRToLoadPosition.end())
+        MaxLoadPosition = std::max(MaxLoadPosition, PosIt->second);
+    }
+  }
+
+  if (MaxLoadPosition == 0)
+    return std::nullopt;
+
+  // Optimal wait = TotalDSLoads - MaxLoadPosition.
+  // This waits until all loads up to and including MaxLoadPosition have
+  // completed, while loads after it may still be in flight.
+  return Info.TotalDSLoads - MaxLoadPosition;
+}
+
+// Try to apply the DS loop wait optimization to relax conservative wait
+// counts. Returns true if the wait count was modified.
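+//
+// Worked example (hypothetical numbers): with TotalDSLoads = 16, a WMMA whose
+// most recently written VGPR operand came from load position 4 gets an
+// optimal DsCnt of 16 - 4 = 12, i.e. it may issue while the 12 younger
+// prefetch loads are still outstanding, instead of draining them all at the
+// baseline DsCnt of 0.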
+bool SIInsertWaitcnts::applyDSLoopWaitOpt(MachineInstr &MI,
+                                          AMDGPU::Waitcnt &Wait) {
+  // Only applies to GFX12+ with a separate DS counter.
+  if (!ST->hasExtendedWaitCounts())
+    return false;
+
+  // Only optimize if the baseline wants a DS wait at all.
+  if (Wait.DsCnt == ~0u)
+    return false;
+
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineLoop *ML = MLI->getLoopFor(MBB);
+
+  // Only apply in single-block loop headers.
+  if (!ML || ML->getNumBlocks() != 1 || ML->getHeader() != MBB)
+    return false;
+
+  auto CacheIt = LoopDSWaitOptCache.find(MBB);
+  if (CacheIt == LoopDSWaitOptCache.end() || !CacheIt->second.Valid)
+    return false;
+
+  // Only optimize if the wait is conservative, i.e. it allows fewer than
+  // half of the loop's DS loads to remain in flight.
+  unsigned HalfLoads = CacheIt->second.TotalDSLoads / 2;
+  if (Wait.DsCnt >= HalfLoads)
+    return false;
+
+  auto OptWait = getOptimalDSWaitCount(MBB, MI);
+  if (!OptWait)
+    return false;
+
+  // Only relax the wait (increase the count), never tighten it, and only
+  // when the relaxation is significant (at least 4 more).
+  if (*OptWait <= Wait.DsCnt || (*OptWait - Wait.DsCnt) < 4)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "DS Loop Opt: Relaxing DsCnt from " << Wait.DsCnt
+                    << " to " << *OptWait << " for: " << MI);
+  Wait.DsCnt = *OptWait;
+  // Mark that relaxation was applied so the preheader flush is inserted.
+  CacheIt->second.RelaxationApplied = true;
+  return true;
 }

 // Return true if it is better to flush the vmcnt counter in the preheader of
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
index d7d2cf96ceac5..48fdabf255e6f 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
@@ -1,24 +1,44 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -verify-machineinstrs -o - %s | FileCheck -check-prefix=OPT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=false -verify-machineinstrs -o - %s | FileCheck -check-prefix=NOOPT %s
+
+# Debug output test (requires an asserts build).
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -debug-only=si-insert-waitcnts -o /dev/null %s 2>&1 | FileCheck -check-prefix=DBG %s
 # REQUIRES: asserts
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -debug-only=si-insert-waitcnts -o /dev/null %s 2>&1 | FileCheck %s

-# Test for DS loop wait optimization eligibility check.
-# Verifies that the pass correctly identifies single-block loops with
-# sufficient DS loads (>=16) and WMMA instructions (>=8) as eligible
-# for optimization.
+# Test for the DS loop wait optimization in single-block loops with WMMA.
+# The preheader DS loads are in reverse order compared to the loop body
+# loads, which causes the baseline to produce conservative waits that the
+# optimization can relax.
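+#
+# Note: the relaxation only fires when the baseline DsCnt keeps fewer than
+# half of the loop's DS loads in flight and the relaxed count is larger by
+# at least 4 (see applyDSLoopWaitOpt in SIInsertWaitcnts.cpp).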
+#
+# Key improvement demonstrated:
+#   Without opt: S_WAIT_DSCNT 0  (wait for ALL 16 loads) before the first WMMA
+#   With opt:    S_WAIT_DSCNT 12 (wait for only 4 loads; 12 remain in flight)
 #
-# CHECK: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible
+# DBG: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible
+# DBG: Loop DS Wait Opt: Analyzed loop at bb.1 - 16 DS loads, HasBarrier=1, Valid=1
+# DBG: DS Loop Opt: Relaxing DsCnt from 0 to 12 for:

 --- |
   define amdgpu_kernel void @ds_loop_eligible() {
     ret void
   }
 ...
 ---
+# OPT-LABEL: name: ds_loop_eligible
+# NOOPT-LABEL: name: ds_loop_eligible
 name: ds_loop_eligible
 tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
   waveLimiter: false
 body: |
+  ; OPT: bb.0:
+  ; OPT-NOT: S_WAIT_DSCNT
+  ; OPT: S_BRANCH %bb.1
+
+  ; NOOPT: bb.0:
+  ; NOOPT-NOT: S_WAIT_DSCNT
+  ; NOOPT: S_BRANCH %bb.1
+
   bb.0:
     successors: %bb.1
     liveins: $sgpr0, $vgpr0
@@ -44,6 +64,17 @@ body: |
     $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
     S_BRANCH %bb.1

+  ; OPT: bb.1:
+  ; OPT: S_WAIT_DSCNT 12
+  ; OPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+  ; OPT: S_WAIT_DSCNT 8
+  ; OPT-NEXT: early-clobber $vgpr88{{.*}} = V_WMMA
+
+  ; NOOPT: bb.1:
+  ; NOOPT: S_WAIT_DSCNT 0
+  ; NOOPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+  ; NOOPT-NOT: S_WAIT_DSCNT 8
+
   bb.1:
     ; Single-block loop with WMMA and DS loads after barrier
     successors: %bb.1, %bb.2