diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c85d2bb9fe9ae..ba6c29a855ebf 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -637,6 +637,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; + bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override { @@ -2174,17 +2176,19 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, bool SIGfx10CacheControl::insertBarrierStart( MachineBasicBlock::iterator &MI) const { - // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU - // mode. This is because a CU mode release fence does not emit any wait, which - // is fine when only dealing with vmem, but isn't sufficient in the presence - // of barriers which do not go through vmem. - // GFX12.5 does not require this additional wait. - if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) + if (!ST.isCuModeEnabled()) return false; + // GFX10/11 CU MODE Workgroup fences do not emit anything. + // In the presence of barriers, we want to make sure previous memory + // operations are actually visible and can be released at a wider scope by + // another thread upon exiting the barrier. To make this possible, we must + // wait on previous stores. + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); + TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); return true; } @@ -2570,6 +2574,23 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, return Changed; } +bool SIGfx12CacheControl::insertBarrierStart( + MachineBasicBlock::iterator &MI) const { + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) + return false; + + // GFX12 CU MODE Workgroup fences do not emit anything (except in GFX12.5). + // In the presence of barriers, we want to make sure previous memory + // operations are actually visible and can be released at a wider scope by + // another thread upon exiting the barrier. To make this possible, we must + // wait on previous stores. + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAIT_STORECNT_soft)) + .addImm(0); + return true; +} + bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll index b91963f08681c..d23509b5aa812 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -150,7 +150,6 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds ; GFX10CU-NEXT: v_mov_b32_e32 v0, s13 ; GFX10CU-NEXT: s_waitcnt vmcnt(0) -; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10CU-NEXT: s_barrier ; GFX10CU-NEXT: ds_read_b32 v0, v0 ; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll index 516c3946f63dc..b5c3cce160a22 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX10-CU-LABEL: test_s_barrier: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_barrier ; GFX10-CU-NEXT: s_endpgm ; @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX11-CU-LABEL: test_s_barrier: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_barrier ; GFX11-CU-NEXT: s_endpgm ; @@ -38,7 +38,7 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX12-CU-LABEL: test_s_barrier: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_alu 0xffe3 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_barrier_signal -1 ; GFX12-CU-NEXT: s_barrier_wait -1 ; GFX12-CU-NEXT: s_endpgm @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; GFX10-CU-LABEL: test_s_barrier_workgroup_fence: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_barrier ; GFX10-CU-NEXT: s_endpgm ; @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; GFX11-CU-LABEL: test_s_barrier_workgroup_fence: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_barrier ; GFX11-CU-NEXT: s_endpgm ; @@ -94,8 +94,7 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX12-CU-LABEL: test_s_barrier_workgroup_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xffe3 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_barrier_signal -1 ; GFX12-CU-NEXT: s_barrier_wait -1 ; GFX12-CU-NEXT: s_endpgm @@ -125,7 +124,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-CU-NEXT: s_barrier ; GFX10-CU-NEXT: s_endpgm ; @@ -140,7 +138,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX11-CU-NEXT: s_barrier ; GFX11-CU-NEXT: s_endpgm ; @@ -160,7 +157,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xffe3 ; GFX12-CU-NEXT: s_barrier_signal -1 ; GFX12-CU-NEXT: s_barrier_wait -1 ; GFX12-CU-NEXT: s_endpgm