From 48c7b23636cf18645c1bc01b3f6f367130154e4a Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi Date: Tue, 25 Nov 2025 22:57:32 +0530 Subject: [PATCH 1/5] [AMDGPU] Add -amdgpu-expand-waitcnt-profiling option for PC-sampling profiling --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 194 +++++++++++++-- .../AMDGPU/expand-waitcnt-profiling.ll | 230 ++++++++++++++++++ 2 files changed, 402 insertions(+), 22 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 70db7b4918515..b86a75e9b04ed 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -63,6 +63,11 @@ static cl::opt ForceEmitZeroLoadFlag( cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden); +static cl::opt ExpandWaitcntProfiling( + "amdgpu-expand-waitcnt-profiling", + cl::desc("Expand s_waitcnt instructions for profiling"), cl::init(false), + cl::Hidden); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -297,6 +302,30 @@ class WaitcntGenerator { // optimization. 
bool isOptNone() const { return OptNone; } + // Get the maximum wait count value for a given counter type + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return AMDGPU::getLoadcntBitMask(IV); + case DS_CNT: + return AMDGPU::getDscntBitMask(IV); + case EXP_CNT: + return AMDGPU::getExpcntBitMask(IV); + case STORE_CNT: + return AMDGPU::getStorecntBitMask(IV); + case SAMPLE_CNT: + return AMDGPU::getSamplecntBitMask(IV); + case BVH_CNT: + return AMDGPU::getBvhcntBitMask(IV); + case KM_CNT: + return AMDGPU::getKmcntBitMask(IV); + case X_CNT: + return 0; // No hardware limit for XCNT + default: + return 0; + } + } + // Edits an existing sequence of wait count instructions according // to an incoming Waitcnt value, which is itself updated to reflect // any new wait count instructions which may need to be generated by @@ -318,9 +347,11 @@ class WaitcntGenerator { // Generates new wait count instructions according to the value of // Wait, returning true if any new instructions were created. + // If ScoreBrackets is provided, it can be used for profiling expansion. virtual bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) = 0; + AMDGPU::Waitcnt Wait, + WaitcntBrackets *ScoreBrackets = nullptr) = 0; // Returns an array of bit masks which can be used to map values in // WaitEventType to corresponding counter values in InstCounterType. 
@@ -356,7 +387,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; + AMDGPU::Waitcnt Wait, + WaitcntBrackets *ScoreBrackets = nullptr) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -393,7 +425,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; + AMDGPU::Waitcnt Wait, + WaitcntBrackets *ScoreBrackets = nullptr) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -1527,38 +1560,104 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( /// required counters in \p Wait bool WaitcntGeneratorPreGFX12::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { + AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { assert(ST); assert(isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); + // Helper to emit expanded waitcnt sequence for profiling. + // Emits waitcnts from (Outstanding-1) down to Target, or just Target if + // nothing to expand. The EmitWaitcnt callback emits a single waitcnt. + auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, + auto EmitWaitcnt) { + if (Outstanding > Target) { + for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) { + EmitWaitcnt(i); + Modified = true; + } + } else { + EmitWaitcnt(Target); + Modified = true; + } + }; + // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a // single instruction while VScnt has its own instruction. 
if (Wait.hasWaitExceptStoreCnt()) { - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Modified = true; + // If profiling expansion is enabled and we have score brackets, + // emit an expanded sequence + if (ExpandWaitcntProfiling && ScoreBrackets) { + if (Wait.LoadCnt != ~0u) { + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(LOAD_CNT) - + ScoreBrackets->getScoreLB(LOAD_CNT), + getWaitCountMax(LOAD_CNT) - 1); + emitExpandedWaitcnt(Outstanding, Wait.LoadCnt, [&](unsigned Count) { + AMDGPU::Waitcnt W; + W.LoadCnt = Count; + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) + .addImm(AMDGPU::encodeWaitcnt(IV, W)); + }); + } + if (Wait.DsCnt != ~0u) { + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(DS_CNT) - + ScoreBrackets->getScoreLB(DS_CNT), + getWaitCountMax(DS_CNT) - 1); + emitExpandedWaitcnt(Outstanding, Wait.DsCnt, [&](unsigned Count) { + AMDGPU::Waitcnt W; + W.DsCnt = Count; + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) + .addImm(AMDGPU::encodeWaitcnt(IV, W)); + }); + } + if (Wait.ExpCnt != ~0u) { + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(EXP_CNT) - + ScoreBrackets->getScoreLB(EXP_CNT), + getWaitCountMax(EXP_CNT) - 1); + emitExpandedWaitcnt(Outstanding, Wait.ExpCnt, [&](unsigned Count) { + AMDGPU::Waitcnt W; + W.ExpCnt = Count; + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) + .addImm(AMDGPU::encodeWaitcnt(IV, W)); + }); + } + } else { + // Normal behavior: emit single combined waitcnt + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old 
Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } if (Wait.hasWaitStoreCnt()) { assert(ST->hasVscnt()); - [[maybe_unused]] auto SWaitInst = + if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u) { + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(STORE_CNT) - + ScoreBrackets->getScoreLB(STORE_CNT), + getWaitCountMax(STORE_CNT) - 1); + emitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) { BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.StoreCnt); - Modified = true; + .addImm(Count); + }); + } else { + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.StoreCnt); + Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } return Modified; @@ -1790,13 +1889,47 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { + AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { assert(ST); assert(!isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); + // Helper to emit expanded waitcnt sequence for profiling. 
+ auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, + auto EmitWaitcnt) { + if (Outstanding > Target) { + for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) { + EmitWaitcnt(i); + Modified = true; + } + } else { + EmitWaitcnt(Target); + Modified = true; + } + }; + + // For GFX12+, we use separate wait instructions, which makes expansion + // simpler + if (ExpandWaitcntProfiling && ScoreBrackets) { + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + unsigned Count = getWait(Wait, CT); + if (Count == ~0u) + continue; + + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - + ScoreBrackets->getScoreLB(CT), + getWaitCountMax(CT) - 1); + emitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) { + BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) + .addImm(Val); + }); + } + return Modified; + } + + // Normal behavior (no expansion) // Check for opportunities to use combined wait instructions. if (Wait.DsCnt != ~0u) { MachineInstr *SWaitInst = nullptr; @@ -2162,9 +2295,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, Modified = WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); - // Any counts that could have been applied to any existing waitcnt - // instructions will have been done so, now deal with any remaining. - ScoreBrackets.applyWaitcnt(Wait); + AMDGPU::Waitcnt WaitForScore = Wait; // ExpCnt can be merged into VINTERP. if (Wait.ExpCnt != ~0u && It != Block.instr_end() && @@ -2181,9 +2312,28 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, << "Update Instr: " << *It); } - if (WCG->createNewWaitcnt(Block, It, Wait)) + // XCnt may be already consumed by a load wait. + // When we wait for KmCnt==0, all SMEM operations (including address + // translations) are complete, so XCNT wait is redundant. 
When we wait for + // LoadCnt==0 and XCnt==0, the LoadCnt wait already ensures all address + // translations are complete (since XCnt follows LoadCnt for loads). When the + // current instruction is a VMEM access, translations are in-order. + if (Wait.XCnt != ~0u) { + if (Wait.KmCnt == 0) + Wait.XCnt = ~0u; + else if (Wait.LoadCnt == 0 && Wait.XCnt == 0) + Wait.XCnt = ~0u; + else if (isVmemAccess(*It)) + Wait.XCnt = ~0u; + } + + if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets)) Modified = true; + // Any counts that could have been applied to any existing waitcnt + // instructions will have been done so, now deal with any remaining. + ScoreBrackets.applyWaitcnt(WaitForScore); + return Modified; } diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll new file mode 100644 index 0000000000000..3daf3142f2a96 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll @@ -0,0 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s + +; This test demonstrates the waitcnt expansion feature for PC-sampling profiling. +; The expansion transforms a single waitcnt instruction into a sequence of waitcnts +; with decreasing counter values to help identify which specific memory operation +; is causing a bottleneck. +; +; The kernels below keep multiple memory operations in flight before each waitcnt +; so that ScoreBrackets tracks a non-zero number of outstanding events. When +; -amdgpu-expand-waitcnt-profiling is enabled, each combined wait is expanded +; into a descending sequence (e.g. 
outstanding=3 emits lgkmcnt(2), (1), (0)) +; which lets PC-sampling attribute long-latency stalls to the specific operation. + +define amdgpu_kernel void @case1_single_counter_lgkmcnt( +; EXPAND-LABEL: case1_single_counter_lgkmcnt: +; EXPAND: ; %bb.0: +; EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; EXPAND-NEXT: s_waitcnt lgkmcnt(2) +; EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: s_add_i32 s0, s0, s1 +; EXPAND-NEXT: s_add_i32 s0, s0, s2 +; EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; EXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; EXPAND-NEXT: s_endpgm +; +; NOEXPAND-LABEL: case1_single_counter_lgkmcnt: +; NOEXPAND: ; %bb.0: +; NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; NOEXPAND-NEXT: s_add_i32 s0, s0, s2 +; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; NOEXPAND-NEXT: s_endpgm + ptr addrspace(4) %ptr_a, + ptr addrspace(4) %ptr_b, + ptr addrspace(4) %ptr_c, + ptr addrspace(1) %out) { + ; Three scalar loads - increment lgkmcnt + %val_a = load i32, ptr addrspace(4) %ptr_a, align 4 + %val_b = load i32, ptr addrspace(4) %ptr_b, align 4 + %val_c = load i32, ptr addrspace(4) %ptr_c, align 4 + + ; Use all three values + %sum1 = add i32 %val_a, %val_b + %sum2 = add i32 %sum1, %val_c + + store i32 %sum2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @case2_independent_counters( +; EXPAND-LABEL: case2_independent_counters: +; EXPAND: ; %bb.0: 
+; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: s_add_i32 s0, s4, s5 +; EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; EXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; EXPAND-NEXT: s_endpgm +; +; NOEXPAND-LABEL: case2_independent_counters: +; NOEXPAND: ; %bb.0: +; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: s_add_i32 s0, s4, s5 +; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; NOEXPAND-NEXT: s_endpgm + ptr addrspace(1) %global_ptr, + ptr addrspace(4) %scalar_ptr, + ptr addrspace(1) %out) { + ; Global memory load - increments vmcnt + %global_val = load i32, ptr addrspace(1) %global_ptr, align 4 + + ; Scalar memory load - increments lgkmcnt + %scalar_val = load i32, ptr addrspace(4) %scalar_ptr, align 4 + + ; Use both values - compiler must wait for both counters + %result = add i32 %global_val, %scalar_val + + store i32 %result, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @case3_overlapping_counters( +; EXPAND-LABEL: case3_overlapping_counters: +; EXPAND: ; %bb.0: +; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; EXPAND-NEXT: v_mov_b32_e32 v1, 1 +; EXPAND-NEXT: v_mov_b32_e32 v2, 2 +; EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: global_store_dword 
v0, v1, s[0:1] +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: s_add_u32 s2, s2, s6 +; EXPAND-NEXT: s_addc_u32 s3, s3, s7 +; EXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: s_endpgm +; +; NOEXPAND-LABEL: case3_overlapping_counters: +; NOEXPAND: ; %bb.0: +; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; NOEXPAND-NEXT: v_mov_b32_e32 v1, 1 +; NOEXPAND-NEXT: v_mov_b32_e32 v2, 2 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; 
NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: s_add_u32 s2, s2, s6 +; NOEXPAND-NEXT: s_addc_u32 s3, s3, s7 +; NOEXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: s_endpgm + ptr addrspace(1) %buf, + ptr addrspace(1) %data, + i64 %offset) { + ; Issue 12 stores to buffer - each increments vmcnt + %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 0 + store volatile i32 1, ptr addrspace(1) %ptr0, align 4 + %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 1 + store volatile i32 2, ptr addrspace(1) %ptr1, align 4 + %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 2 + store volatile i32 1, ptr addrspace(1) %ptr2, align 4 + %ptr3 = getelementptr i32, ptr addrspace(1) %buf, i64 3 + store volatile i32 2, ptr addrspace(1) %ptr3, align 4 + %ptr4 = getelementptr i32, ptr addrspace(1) %buf, i64 4 + store volatile i32 1, ptr addrspace(1) %ptr4, align 4 + %ptr5 = getelementptr i32, ptr addrspace(1) %buf, i64 5 + store volatile i32 2, ptr addrspace(1) %ptr5, align 4 + %ptr6 
= getelementptr i32, ptr addrspace(1) %buf, i64 6 + store volatile i32 1, ptr addrspace(1) %ptr6, align 4 + %ptr7 = getelementptr i32, ptr addrspace(1) %buf, i64 7 + store volatile i32 2, ptr addrspace(1) %ptr7, align 4 + %ptr8 = getelementptr i32, ptr addrspace(1) %buf, i64 8 + store volatile i32 1, ptr addrspace(1) %ptr8, align 4 + %ptr9 = getelementptr i32, ptr addrspace(1) %buf, i64 9 + store volatile i32 2, ptr addrspace(1) %ptr9, align 4 + %ptr10 = getelementptr i32, ptr addrspace(1) %buf, i64 10 + store volatile i32 1, ptr addrspace(1) %ptr10, align 4 + %ptr11 = getelementptr i32, ptr addrspace(1) %buf, i64 11 + store volatile i32 2, ptr addrspace(1) %ptr11, align 4 + + ; Load from potentially aliasing address - also increments vmcnt + %data_ptr = getelementptr i8, ptr addrspace(1) %data, i64 %offset + %loaded = load volatile i32, ptr addrspace(1) %data_ptr, align 4 + + ; Store the loaded value + %ptr12 = getelementptr i32, ptr addrspace(1) %buf, i64 12 + store volatile i32 %loaded, ptr addrspace(1) %ptr12, align 4 + + ret void +} From a28ab4e6c7b0079a0b7d38aab7ec4cc1a3d926af Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi Date: Wed, 26 Nov 2025 13:09:43 +0530 Subject: [PATCH 2/5] add run line for diff GPU Gen and counter types --- .../AMDGPU/expand-waitcnt-profiling.ll | 790 +++++++++++++----- 1 file changed, 577 insertions(+), 213 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll index 3daf3142f2a96..6a0b053d315de 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll @@ -1,230 +1,594 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck 
--check-prefix=NOEXPAND %s - -; This test demonstrates the waitcnt expansion feature for PC-sampling profiling. -; The expansion transforms a single waitcnt instruction into a sequence of waitcnts -; with decreasing counter values to help identify which specific memory operation -; is causing a bottleneck. -; -; The kernels below keep multiple memory operations in flight before each waitcnt -; so that ScoreBrackets tracks a non-zero number of outstanding events. When -; -amdgpu-expand-waitcnt-profiling is enabled, each combined wait is expanded -; into a descending sequence (e.g. outstanding=3 emits lgkmcnt(2), (1), (0)) -; which lets PC-sampling attribute long-latency stalls to the specific operation. - -define amdgpu_kernel void @case1_single_counter_lgkmcnt( -; EXPAND-LABEL: case1_single_counter_lgkmcnt: -; EXPAND: ; %bb.0: -; EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; EXPAND-NEXT: v_mov_b32_e32 v0, 0 -; EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 -; EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 -; EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 -; EXPAND-NEXT: s_waitcnt lgkmcnt(2) -; EXPAND-NEXT: s_waitcnt lgkmcnt(1) -; EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; EXPAND-NEXT: s_add_i32 s0, s0, s1 -; EXPAND-NEXT: s_add_i32 s0, s0, s2 -; EXPAND-NEXT: v_mov_b32_e32 v1, s0 -; EXPAND-NEXT: global_store_dword v0, v1, s[14:15] -; EXPAND-NEXT: s_endpgm -; -; NOEXPAND-LABEL: case1_single_counter_lgkmcnt: -; NOEXPAND: ; %bb.0: -; NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 -; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 -; NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 -; NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 -; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; NOEXPAND-NEXT: s_add_i32 s0, s0, s1 -; NOEXPAND-NEXT: s_add_i32 s0, s0, s2 -; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 -; NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] -; NOEXPAND-NEXT: s_endpgm - ptr 
addrspace(4) %ptr_a, - ptr addrspace(4) %ptr_b, - ptr addrspace(4) %ptr_c, - ptr addrspace(1) %out) { - ; Three scalar loads - increment lgkmcnt +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX9-EXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9-NOEXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX10-EXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10-NOEXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX11-EXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11-NOEXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-expand-waitcnt-profiling < %s | FileCheck --check-prefix=GFX12-EXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12-NOEXPAND %s + +; When -amdgpu-expand-waitcnt-profiling is enabled and there are N outstanding +; operations, instead of emitting a single waitcnt(target), we emit: +; waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target) +; +; This allows PC-sampling profilers to identify which specific operation +; is causing a stall by observing where the program counter is stuck. 
+ +define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) { +; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX9-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; 
GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(2) +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 
0x0 +; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x2 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s2 +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX12-NOEXPAND: ; %bb.0: +; GFX12-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s2 +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-NOEXPAND-NEXT: s_endpgm + %val_a = load i32, ptr addrspace(4) 
%ptr_a, align 4 %val_b = load i32, ptr addrspace(4) %ptr_b, align 4 %val_c = load i32, ptr addrspace(4) %ptr_c, align 4 - - ; Use all three values %sum1 = add i32 %val_a, %val_b %sum2 = add i32 %sum1, %val_c - store i32 %sum2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @case2_independent_counters( -; EXPAND-LABEL: case2_independent_counters: -; EXPAND: ; %bb.0: -; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; EXPAND-NEXT: v_mov_b32_e32 v0, 0 -; EXPAND-NEXT: s_waitcnt lgkmcnt(1) -; EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 -; EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 -; EXPAND-NEXT: s_waitcnt lgkmcnt(1) -; EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; EXPAND-NEXT: s_add_i32 s0, s4, s5 -; EXPAND-NEXT: v_mov_b32_e32 v1, s0 -; EXPAND-NEXT: global_store_dword v0, v1, s[6:7] -; EXPAND-NEXT: s_endpgm -; -; NOEXPAND-LABEL: case2_independent_counters: -; NOEXPAND: ; %bb.0: -; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 -; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 -; NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 -; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; NOEXPAND-NEXT: s_add_i32 s0, s4, s5 -; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 -; NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] -; NOEXPAND-NEXT: s_endpgm - ptr addrspace(1) %global_ptr, - ptr addrspace(4) %scalar_ptr, - ptr addrspace(1) %out) { - ; Global memory load - increments vmcnt - %global_val = load i32, ptr addrspace(1) %global_ptr, align 4 - - ; Scalar memory load - increments lgkmcnt - %scalar_val = load i32, ptr addrspace(4) %scalar_ptr, align 4 - - ; Use both values - compiler must wait for both counters - %result = add i32 %global_val, %scalar_val +define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) { +; GFX9-EXPAND-LABEL: 
test_vmcnt_global_loads: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 +; GFX9-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 +; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(2) +; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(1) +; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX9-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_vmcnt_global_loads: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 +; GFX9-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 +; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_vmcnt_global_loads: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_clause 0x2 +; GFX10-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 +; GFX10-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 +; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(2) +; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(1) +; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX10-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_vmcnt_global_loads: 
+; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_clause 0x2 +; GFX10-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 +; GFX10-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 +; GFX10-NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX10-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_vmcnt_global_loads: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_clause 0x2 +; GFX11-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 +; GFX11-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 +; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(2) +; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(1) +; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX11-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_vmcnt_global_loads: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_clause 0x2 +; GFX11-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 +; GFX11-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 +; GFX11-NOEXPAND-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_vmcnt_global_loads: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_clause 0x2 +; GFX12-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 +; GFX12-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 +; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x2 +; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x1 +; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x0 +; GFX12-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_vmcnt_global_loads: +; GFX12-NOEXPAND: ; %bb.0: +; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_clause 0x2 +; GFX12-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 +; GFX12-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 +; GFX12-NOEXPAND-NEXT: s_wait_loadcnt 0x0 +; GFX12-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NOEXPAND-NEXT: s_endpgm + + ; Use thread ID to create thread-varying addresses -> forces vector loads + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid64 = zext i32 %tid to i64 - store i32 %result, ptr addrspace(1) %out, align 4 + ; Three separate global loads with thread-varying addresses + ; Non-volatile loads allow 
multiple operations to be in-flight + %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64 + %val0 = load i32, ptr addrspace(1) %ptr0, align 4 + + %offset1 = add i64 %tid64, 64 + %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1 + %val1 = load i32, ptr addrspace(1) %ptr1, align 4 + + %offset2 = add i64 %tid64, 128 + %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2 + %val2 = load i32, ptr addrspace(1) %ptr2, align 4 + + %sum1 = add i32 %val0, %val1 + %sum2 = add i32 %sum1, %val2 + + %out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tid64 + store i32 %sum2, ptr addrspace(1) %out_ptr, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) { +; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX9-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-EXPAND-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX9-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX9-NOEXPAND-NEXT: s_waitcnt 
lgkmcnt(1) +; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NOEXPAND-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_clause 0x1 +; GFX10-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX10-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_clause 0x1 +; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX10-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_clause 0x1 +; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 
0x2c +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_clause 0x1 +; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_clause 0x1 +; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1 +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 
+; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX12-NOEXPAND: ; %bb.0: +; GFX12-NOEXPAND-NEXT: s_clause 0x1 +; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX12-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1 +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NOEXPAND-NEXT: s_endpgm + + %ptr0 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 0 + %ptr1 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 + %ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 2 + %val0 = load i32, ptr addrspace(3) %ptr0, align 4 + %val1 = load i32, ptr addrspace(3) %ptr1, align 4 + %val2 = load i32, ptr addrspace(3) %ptr2, align 4 + %sum1 = add i32 %val0, %val1 + %sum2 = add i32 %sum1, %val2 + store i32 %sum2, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @case3_overlapping_counters( -; EXPAND-LABEL: case3_overlapping_counters: -; EXPAND: ; %bb.0: -; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; EXPAND-NEXT: v_mov_b32_e32 v0, 0 -; EXPAND-NEXT: v_mov_b32_e32 v1, 1 -; EXPAND-NEXT: v_mov_b32_e32 v2, 2 -; EXPAND-NEXT: s_waitcnt lgkmcnt(1) -; EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4 -; 
EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: s_add_u32 s2, s2, s6 -; EXPAND-NEXT: s_addc_u32 s3, s3, s7 -; EXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48 -; EXPAND-NEXT: s_waitcnt vmcnt(0) -; EXPAND-NEXT: s_endpgm -; -; NOEXPAND-LABEL: case3_overlapping_counters: -; NOEXPAND: ; %bb.0: -; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 -; NOEXPAND-NEXT: v_mov_b32_e32 v1, 1 -; NOEXPAND-NEXT: v_mov_b32_e32 v2, 2 -; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: 
global_store_dword v0, v1, s[0:1] offset:16 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: s_add_u32 s2, s2, s6 -; NOEXPAND-NEXT: s_addc_u32 s3, s3, s7 -; NOEXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48 -; NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; NOEXPAND-NEXT: s_endpgm - ptr addrspace(1) %buf, - ptr addrspace(1) %data, - i64 %offset) { - ; Issue 12 stores to buffer - each increments vmcnt - %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 0 - store volatile i32 1, ptr addrspace(1) %ptr0, align 4 - %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 1 - store volatile i32 2, ptr addrspace(1) %ptr1, align 4 - %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 2 - store volatile i32 1, ptr addrspace(1) %ptr2, align 4 - %ptr3 = getelementptr i32, ptr addrspace(1) %buf, i64 3 - store volatile i32 2, ptr addrspace(1) %ptr3, align 4 - %ptr4 = getelementptr i32, ptr addrspace(1) %buf, i64 4 - store volatile i32 1, ptr addrspace(1) %ptr4, align 4 - %ptr5 = getelementptr i32, ptr addrspace(1) %buf, i64 5 - store volatile i32 2, ptr addrspace(1) %ptr5, align 4 - %ptr6 = getelementptr i32, ptr addrspace(1) %buf, i64 6 - store volatile i32 1, ptr addrspace(1) %ptr6, align 4 - %ptr7 
= getelementptr i32, ptr addrspace(1) %buf, i64 7 - store volatile i32 2, ptr addrspace(1) %ptr7, align 4 - %ptr8 = getelementptr i32, ptr addrspace(1) %buf, i64 8 - store volatile i32 1, ptr addrspace(1) %ptr8, align 4 - %ptr9 = getelementptr i32, ptr addrspace(1) %buf, i64 9 - store volatile i32 2, ptr addrspace(1) %ptr9, align 4 - %ptr10 = getelementptr i32, ptr addrspace(1) %buf, i64 10 - store volatile i32 1, ptr addrspace(1) %ptr10, align 4 - %ptr11 = getelementptr i32, ptr addrspace(1) %buf, i64 11 - store volatile i32 2, ptr addrspace(1) %ptr11, align 4 - - ; Load from potentially aliasing address - also increments vmcnt - %data_ptr = getelementptr i8, ptr addrspace(1) %data, i64 %offset - %loaded = load volatile i32, ptr addrspace(1) %data_ptr, align 4 - - ; Store the loaded value - %ptr12 = getelementptr i32, ptr addrspace(1) %buf, i64 12 - store volatile i32 %loaded, ptr addrspace(1) %ptr12, align 4 +define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) { +; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s4, s5 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_clause 0x1 +; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_clause 0x1 +; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s4, s5 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_clause 0x1 +; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-EXPAND-NEXT: 
s_waitcnt lgkmcnt(1) +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_clause 0x1 +; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_clause 0x1 +; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX12-NOEXPAND: 
; %bb.0: +; GFX12-NOEXPAND-NEXT: s_clause 0x1 +; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-NOEXPAND-NEXT: s_endpgm + + %scalar_val1 = load i32, ptr addrspace(4) %scalar_ptr_a, align 4 + %scalar_val2 = load i32, ptr addrspace(4) %scalar_ptr_b, align 4 + %result = add i32 %scalar_val1, %scalar_val2 + store i32 %result, ptr addrspace(1) %out, align 4 ret void } From 7e993fb33983b5a1912a840940f6d18d3ab14b06 Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi Date: Wed, 26 Nov 2025 18:49:41 +0530 Subject: [PATCH 3/5] Address reviewer feedback: fix getWaitCountMax and reduce code duplication - Fix getWaitCountMax() to use correct bitmasks based on architecture: - Pre-GFX12: Use getVmcntBitMask/getLgkmcntBitMask for LOAD_CNT/DS_CNT - GFX12+: Use getLoadcntBitMask/getDscntBitMask for LOAD_CNT/DS_CNT - Refactor repetitive if-blocks for LOAD_CNT, DS_CNT, EXP_CNT into a single loop using getCounterRef helper function - Fix X_CNT to return proper getXcntBitMask(IV) instead of 0 --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 50 ++++++++------------- 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index b86a75e9b04ed..5aba3e2833b9c 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -302,13 +302,17 @@ class WaitcntGenerator { // optimization. 
bool isOptNone() const { return OptNone; } - // Get the maximum wait count value for a given counter type + // Get the maximum wait count value for a given counter type. + // For pre-GFX12, LOAD_CNT uses vmcnt and DS_CNT uses lgkmcnt. + // For GFX12+, LOAD_CNT uses loadcnt and DS_CNT uses dscnt. unsigned getWaitCountMax(InstCounterType T) const { switch (T) { case LOAD_CNT: - return AMDGPU::getLoadcntBitMask(IV); + return ST->hasExtendedWaitCounts() ? AMDGPU::getLoadcntBitMask(IV) + : AMDGPU::getVmcntBitMask(IV); case DS_CNT: - return AMDGPU::getDscntBitMask(IV); + return ST->hasExtendedWaitCounts() ? AMDGPU::getDscntBitMask(IV) + : AMDGPU::getLgkmcntBitMask(IV); case EXP_CNT: return AMDGPU::getExpcntBitMask(IV); case STORE_CNT: @@ -320,7 +324,7 @@ class WaitcntGenerator { case KM_CNT: return AMDGPU::getKmcntBitMask(IV); case X_CNT: - return 0; // No hardware limit for XCNT + return AMDGPU::getXcntBitMask(IV); default: return 0; } @@ -1589,35 +1593,17 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( // If profiling expansion is enabled and we have score brackets, // emit an expanded sequence if (ExpandWaitcntProfiling && ScoreBrackets) { - if (Wait.LoadCnt != ~0u) { - unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(LOAD_CNT) - - ScoreBrackets->getScoreLB(LOAD_CNT), - getWaitCountMax(LOAD_CNT) - 1); - emitExpandedWaitcnt(Outstanding, Wait.LoadCnt, [&](unsigned Count) { - AMDGPU::Waitcnt W; - W.LoadCnt = Count; - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(AMDGPU::encodeWaitcnt(IV, W)); - }); - } - if (Wait.DsCnt != ~0u) { - unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(DS_CNT) - - ScoreBrackets->getScoreLB(DS_CNT), - getWaitCountMax(DS_CNT) - 1); - emitExpandedWaitcnt(Outstanding, Wait.DsCnt, [&](unsigned Count) { - AMDGPU::Waitcnt W; - W.DsCnt = Count; - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(AMDGPU::encodeWaitcnt(IV, W)); - }); - } - if (Wait.ExpCnt != ~0u) { - unsigned Outstanding = 
std::min(ScoreBrackets->getScoreUB(EXP_CNT) - - ScoreBrackets->getScoreLB(EXP_CNT), - getWaitCountMax(EXP_CNT) - 1); - emitExpandedWaitcnt(Outstanding, Wait.ExpCnt, [&](unsigned Count) { + for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { + unsigned &WaitCnt = getCounterRef(Wait, CT); + if (WaitCnt == ~0u) + continue; + + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - + ScoreBrackets->getScoreLB(CT), + getWaitCountMax(CT) - 1); + emitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) { AMDGPU::Waitcnt W; - W.ExpCnt = Count; + getCounterRef(W, CT) = Count; BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) .addImm(AMDGPU::encodeWaitcnt(IV, W)); }); From 709640d569e7ea3c886dc98a3b51c04aaff4fd70 Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi Date: Tue, 2 Dec 2025 13:20:55 +0530 Subject: [PATCH 4/5] skip expanding out-of-order events --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 54 ++++-- .../AMDGPU/expand-waitcnt-profiling.ll | 163 +++++++++++++++--- 2 files changed, 185 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 5aba3e2833b9c..bbf73dd4d748c 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1593,20 +1593,40 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( // If profiling expansion is enabled and we have score brackets, // emit an expanded sequence if (ExpandWaitcntProfiling && ScoreBrackets) { + // Check if any of the counters to be waited on are out-of-order. + // If so, fall back to normal (non-expanded) behavior since expansion + // would provide misleading profiling information. 
+ bool AnyOutOfOrder = false; for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { unsigned &WaitCnt = getCounterRef(Wait, CT); - if (WaitCnt == ~0u) - continue; + if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) { + AnyOutOfOrder = true; + break; + } + } + + if (AnyOutOfOrder) { + // Fall back to non-expanded wait + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + } else { + // All counters are in-order, safe to expand + for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { + unsigned &WaitCnt = getCounterRef(Wait, CT); + if (WaitCnt == ~0u) + continue; - unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - - ScoreBrackets->getScoreLB(CT), - getWaitCountMax(CT) - 1); - emitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) { - AMDGPU::Waitcnt W; - getCounterRef(W, CT) = Count; - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(AMDGPU::encodeWaitcnt(IV, W)); - }); + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - + ScoreBrackets->getScoreLB(CT), + getWaitCountMax(CT) - 1); + emitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) { + AMDGPU::Waitcnt W; + getCounterRef(W, CT) = Count; + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) + .addImm(AMDGPU::encodeWaitcnt(IV, W)); + }); + } } } else { // Normal behavior: emit single combined waitcnt @@ -1624,7 +1644,9 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( if (Wait.hasWaitStoreCnt()) { assert(ST->hasVscnt()); - if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u) { + if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u && + !ScoreBrackets->counterOutOfOrder(STORE_CNT)) { + // Only expand if counter is not out-of-order unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(STORE_CNT) - ScoreBrackets->getScoreLB(STORE_CNT), getWaitCountMax(STORE_CNT) - 1); @@ -1904,6 +1926,14 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( if (Count == 
~0u) continue; + // Skip expansion for out-of-order counters - emit normal wait instead + if (ScoreBrackets->counterOutOfOrder(CT)) { + BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) + .addImm(Count); + Modified = true; + continue; + } + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - ScoreBrackets->getScoreLB(CT), getWaitCountMax(CT) - 1); diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll index 6a0b053d315de..ec30477e34ba2 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll @@ -24,8 +24,6 @@ define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, pt ; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2 @@ -56,8 +54,6 @@ define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, pt ; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2 @@ -87,8 +83,6 @@ define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, pt ; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(2) -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 ; 
GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -119,8 +113,6 @@ define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, pt ; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x2 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 ; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -313,7 +305,6 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr ; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -346,7 +337,6 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr ; GFX10-EXPAND-NEXT: s_clause 0x1 ; GFX10-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -381,7 +371,6 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr ; GFX11-EXPAND-NEXT: s_clause 0x1 ; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 @@ -416,7 +405,6 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr ; GFX12-EXPAND-NEXT: 
s_clause 0x1 ; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 @@ -464,11 +452,9 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ ; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5 ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 @@ -495,11 +481,9 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ ; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 @@ -526,11 +510,9 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ ; GFX11-EXPAND-NEXT: s_clause 0x1 ; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX11-EXPAND-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -558,11 +540,9 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ ; GFX12-EXPAND-NEXT: s_clause 0x1 ; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x1 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 ; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -592,3 +572,146 @@ define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ store i32 %result, ptr addrspace(1) %out, align 4 ret void } + +; Test that expansion is NOT applied when counters are out-of-order (mixed event types). +; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete +; out-of-order relative to each other. When both are in-flight, we should NOT expand +; because the intermediate expanded waits would not correspond to actual completion order.
+define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) { +; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX9-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-EXPAND-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NOEXPAND-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_clause 0x1 +; GFX10-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_clause 0x1 +; GFX10-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_clause 0x1 +; GFX11-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_clause 0x1 +; GFX11-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 
v[0:1], v0 offset1:1 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_clause 0x1 +; GFX12-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX12-NOEXPAND: ; %bb.0: +; GFX12-NOEXPAND-NEXT: s_clause 0x1 +; GFX12-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NOEXPAND-NEXT: s_endpgm + + %lds_val1 = load i32, ptr addrspace(3) %lds_ptr, align 4 + %smem_val = load i32, ptr addrspace(4) %smem_ptr, 
align 4 + %lds_ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 + %lds_val2 = load i32, ptr addrspace(3) %lds_ptr2, align 4 + %sum1 = add i32 %lds_val1, %lds_val2 + %sum2 = add i32 %sum1, %smem_val + store i32 %sum2, ptr addrspace(1) %out, align 4 + ret void +} From 399166c09c9dfa699bef321e238e55d4501ba600 Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi Date: Tue, 2 Dec 2025 18:32:41 +0530 Subject: [PATCH 5/5] fix: resolve issue after rebase --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index bbf73dd4d748c..d189733171bb1 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2328,21 +2328,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, << "Update Instr: " << *It); } - // XCnt may be already consumed by a load wait. - // When we wait for KmCnt==0, all SMEM operations (including address - // translations) are complete, so XCNT wait is redundant. When we wait for - // LoadCnt==0 and XCnt==0, the LoadCnt wait already ensures all address - // translations are complete (since XCnt follows LoadCnt for loads). When the - // current instruction is a VMEM access, translations are in-order. - if (Wait.XCnt != ~0u) { - if (Wait.KmCnt == 0) - Wait.XCnt = ~0u; - else if (Wait.LoadCnt == 0 && Wait.XCnt == 0) - Wait.XCnt = ~0u; - else if (isVmemAccess(*It)) - Wait.XCnt = ~0u; - } - if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets)) Modified = true;