Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 180 additions & 29 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);

// Hidden debug/profiling flag. When set, instead of emitting one combined
// wait, the inserter emits a descending sequence of wait instructions (one
// per outstanding event, capped at the counter's maximum) so stall time can
// be attributed to individual in-flight operations. Expansion is skipped for
// counters that may complete out of order, where the sequence would be
// misleading. Off by default.
static cl::opt<bool> ExpandWaitcntProfiling(
    "amdgpu-expand-waitcnt-profiling",
    cl::desc("Expand s_waitcnt instructions for profiling"), cl::init(false),
    cl::Hidden);

namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
Expand Down Expand Up @@ -297,6 +302,34 @@ class WaitcntGenerator {
// optimization.
bool isOptNone() const { return OptNone; }

// Return the maximum value representable by the hardware counter \p T,
// i.e. the all-ones bit mask for that counter field.
// Pre-GFX12, LOAD_CNT is encoded as vmcnt and DS_CNT as lgkmcnt; on targets
// with extended wait counts (GFX12+) they are loadcnt and dscnt instead.
// Counters not modeled on this target yield 0.
unsigned getWaitCountMax(InstCounterType T) const {
  const bool Extended = ST->hasExtendedWaitCounts();
  switch (T) {
  case LOAD_CNT:
    return Extended ? AMDGPU::getLoadcntBitMask(IV)
                    : AMDGPU::getVmcntBitMask(IV);
  case DS_CNT:
    return Extended ? AMDGPU::getDscntBitMask(IV)
                    : AMDGPU::getLgkmcntBitMask(IV);
  case EXP_CNT:
    return AMDGPU::getExpcntBitMask(IV);
  case STORE_CNT:
    return AMDGPU::getStorecntBitMask(IV);
  case SAMPLE_CNT:
    return AMDGPU::getSamplecntBitMask(IV);
  case BVH_CNT:
    return AMDGPU::getBvhcntBitMask(IV);
  case KM_CNT:
    return AMDGPU::getKmcntBitMask(IV);
  case X_CNT:
    return AMDGPU::getXcntBitMask(IV);
  default:
    return 0;
  }
}

// Edits an existing sequence of wait count instructions according
// to an incoming Waitcnt value, which is itself updated to reflect
// any new wait count instructions which may need to be generated by
Expand All @@ -318,9 +351,11 @@ class WaitcntGenerator {

// Generates new wait count instructions according to the value of
// Wait, returning true if any new instructions were created.
// If ScoreBrackets is provided, it can be used for profiling expansion.
virtual bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) = 0;
AMDGPU::Waitcnt Wait,
WaitcntBrackets *ScoreBrackets = nullptr) = 0;

// Returns an array of bit masks which can be used to map values in
// WaitEventType to corresponding counter values in InstCounterType.
Expand Down Expand Up @@ -356,7 +391,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {

bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) override;
AMDGPU::Waitcnt Wait,
WaitcntBrackets *ScoreBrackets = nullptr) override;

const unsigned *getWaitEventMask() const override {
assert(ST);
Expand Down Expand Up @@ -393,7 +429,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {

bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) override;
AMDGPU::Waitcnt Wait,
WaitcntBrackets *ScoreBrackets = nullptr) override;

const unsigned *getWaitEventMask() const override {
assert(ST);
Expand Down Expand Up @@ -1523,38 +1560,108 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) {
AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
assert(ST);
assert(isNormalMode(MaxCounter));

bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);

// Helper to emit expanded waitcnt sequence for profiling.
// Emits waitcnts from (Outstanding-1) down to Target, or just Target if
// nothing to expand. The EmitWaitcnt callback emits a single waitcnt.
auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
auto EmitWaitcnt) {
if (Outstanding > Target) {
for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
EmitWaitcnt(i);
Modified = true;
}
} else {
EmitWaitcnt(Target);
Modified = true;
}
};

// Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
// single instruction while VScnt has its own instruction.
if (Wait.hasWaitExceptStoreCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
[[maybe_unused]] auto SWaitInst =
// If profiling expansion is enabled and we have score brackets,
// emit an expanded sequence
if (ExpandWaitcntProfiling && ScoreBrackets) {
// Check if any of the counters to be waited on are out-of-order.
// If so, fall back to normal (non-expanded) behavior since expansion
// would provide misleading profiling information.
bool AnyOutOfOrder = false;
for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
unsigned &WaitCnt = getCounterRef(Wait, CT);
if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) {
AnyOutOfOrder = true;
break;
}
}

if (AnyOutOfOrder) {
// Fall back to non-expanded wait
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
Modified = true;
Modified = true;
} else {
// All counters are in-order, safe to expand
for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
unsigned &WaitCnt = getCounterRef(Wait, CT);
if (WaitCnt == ~0u)
continue;

LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
ScoreBrackets->getScoreLB(CT),
getWaitCountMax(CT) - 1);
emitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
AMDGPU::Waitcnt W;
getCounterRef(W, CT) = Count;
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
.addImm(AMDGPU::encodeWaitcnt(IV, W));
});
}
}
} else {
// Normal behavior: emit single combined waitcnt
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
[[maybe_unused]] auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
Modified = true;

LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
}

if (Wait.hasWaitStoreCnt()) {
assert(ST->hasVscnt());

[[maybe_unused]] auto SWaitInst =
if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u &&
!ScoreBrackets->counterOutOfOrder(STORE_CNT)) {
// Only expand if counter is not out-of-order
unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
ScoreBrackets->getScoreLB(STORE_CNT),
getWaitCountMax(STORE_CNT) - 1);
emitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.StoreCnt);
Modified = true;
.addImm(Count);
});
} else {
[[maybe_unused]] auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.StoreCnt);
Modified = true;

LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
}

return Modified;
Expand Down Expand Up @@ -1777,13 +1884,55 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) {
AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
assert(ST);
assert(!isNormalMode(MaxCounter));

bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);

// Helper to emit expanded waitcnt sequence for profiling.
auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
auto EmitWaitcnt) {
if (Outstanding > Target) {
for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
EmitWaitcnt(i);
Modified = true;
}
} else {
EmitWaitcnt(Target);
Modified = true;
}
};

// For GFX12+, we use separate wait instructions, which makes expansion
// simpler
if (ExpandWaitcntProfiling && ScoreBrackets) {
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
unsigned Count = getWait(Wait, CT);
if (Count == ~0u)
continue;

// Skip expansion for out-of-order counters - emit normal wait instead
if (ScoreBrackets->counterOutOfOrder(CT)) {
BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
.addImm(Count);
Modified = true;
continue;
}

unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
ScoreBrackets->getScoreLB(CT),
getWaitCountMax(CT) - 1);
emitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
.addImm(Val);
});
}
return Modified;
}

// Normal behavior (no expansion)
// Check for opportunities to use combined wait instructions.
if (Wait.DsCnt != ~0u) {
MachineInstr *SWaitInst = nullptr;
Expand Down Expand Up @@ -2141,9 +2290,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
Modified =
WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

// Any counts that could have been applied to any existing waitcnt
// instructions will have been done so, now deal with any remaining.
ScoreBrackets.applyWaitcnt(Wait);
AMDGPU::Waitcnt WaitForScore = Wait;

// ExpCnt can be merged into VINTERP.
if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
Expand All @@ -2161,23 +2308,27 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}

// XCnt may be already consumed by a load wait.
// When we wait for KmCnt==0, all SMEM operations (including address
// translations) are complete, so XCNT wait is redundant. When we wait for
// LoadCnt==0 and XCnt==0, the LoadCnt wait already ensures all address
// translations are complete (since XCnt follows LoadCnt for loads). When the
// current instruction is a VMEM access, translations are in-order.
if (Wait.XCnt != ~0u) {
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
if (Wait.KmCnt == 0)
Wait.XCnt = ~0u;

if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
else if (Wait.LoadCnt == 0 && Wait.XCnt == 0)
Wait.XCnt = ~0u;

// Since the translation for VMEM addresses occur in-order, we can skip the
// XCnt if the current instruction is of VMEM type and has a memory
// dependency with another VMEM instruction in flight.
if (isVmemAccess(*It))
else if (isVmemAccess(*It))
Wait.XCnt = ~0u;
}

if (WCG->createNewWaitcnt(Block, It, Wait))
if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
Modified = true;

// Any counts that could have been applied to any existing waitcnt
// instructions will have been done so, now deal with any remaining.
ScoreBrackets.applyWaitcnt(WaitForScore);

return Modified;
}

Expand Down
Loading