From fd0d010d4a6bba5cf056d4774d476369462a2355 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Tue, 2 Dec 2025 13:56:46 +0530 Subject: [PATCH] [AMDGPU][Waitcnts] Don't create a pending flat event for LDS DMA Flat instructions need a waitcnt(0) on both VMEM and LDS accesses, but only when the instruction really is using flat addressing. The LDS DMA instructions (on GFX9) have the FLAT flag set, but they have very clear semantics. These instructions update only VM_CNT (on GFX9), and hence do not need to be treated like actual flat instructions. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 11 +++++++---- llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 70db7b4918515..39b001b835ed2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2291,10 +2291,13 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } - // This is a flat memory operation that access both VMEM and LDS, so note it - // - it will require that both the VM and LGKM be flushed to zero if it is - // pending when a VM or LGKM dependency occurs. - if (FlatASCount > 1) + // If this is a truly flat memory operation, then it accesses both VMEM and + // LDS, so note it - it will require that both the VM and LGKM be flushed to + // zero if it is pending when a VM or LGKM dependency occurs.
+ // + // For example, LDS DMA operations have FLAT set in their TSFlags for + // unspecified reasons, but they are not flat operations. + if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1) ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll index 2df8be55de3a8..37ba1f42413c9 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll @@ -107,9 +107,10 @@ define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocap ; GFX9-NEXT: s_lshl_b32 s1, s3, 2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: ds_read_b32 v0, v0 ; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_read_b32 v1, v1 offset:256 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]