diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 70db7b4918515..39b001b835ed2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2291,10 +2291,13 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } - // This is a flat memory operation that access both VMEM and LDS, so note it - // - it will require that both the VM and LGKM be flushed to zero if it is - // pending when a VM or LGKM dependency occurs. - if (FlatASCount > 1) + // If this is a truly flat memory operation, then it accesses both VMEM and + // LDS, so note it - it will require that both the VM and LGKM be flushed to + // zero if it is pending when a VM or LGKM dependency occurs. + // + // For example, LDS DMA operations have FLAT set in their TSFlags for + // unspecified reasons, but they are not flat operations. + if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1) ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll index 2df8be55de3a8..37ba1f42413c9 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll @@ -107,9 +107,10 @@ define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocap ; GFX9-NEXT: s_lshl_b32 s1, s3, 2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: ds_read_b32 v0, v0 ; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_read_b32 v1, v1 offset:256 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]