diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index edd3ce72d7df3..1038797374de3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1902,6 +1902,7 @@ def FeatureISAVersion10_3_Generic: FeatureSet< def FeatureISAVersion11_Common : FeatureSet< [FeatureGFX11, + FeatureBackOffBarrier, FeatureLDSBankCount32, FeatureDLInsts, FeatureDot5Insts, @@ -1985,6 +1986,7 @@ def FeatureISAVersion11_5_3 : FeatureSet< def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, + FeatureBackOffBarrier, FeatureAddressableLocalMemorySize65536, FeatureLDSBankCount32, FeatureDLInsts, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index fdbd9ce4a66bf..f7c7bb509c9ef 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -983,13 +983,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform; } - bool isBarrier(unsigned Opcode) const { + // Check to see if opcode is for a barrier start. Pre gfx12 this is just the + // S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want + // to check for the barrier start (S_BARRIER_SIGNAL*) + bool isBarrierStart(unsigned Opcode) const { return Opcode == AMDGPU::S_BARRIER || Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 || Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 || Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || - Opcode == AMDGPU::S_BARRIER_WAIT || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM; + } + + bool isBarrier(unsigned Opcode) const { + return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT || Opcode == AMDGPU::S_BARRIER_INIT_M0 || Opcode == AMDGPU::S_BARRIER_INIT_IMM || Opcode == AMDGPU::S_BARRIER_JOIN_IMM || diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 53f554eccb1fb..3006eaf468bdd 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -359,6 +359,12 @@ class SICacheControl { bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; + /// Inserts any necessary instructions before the barrier start instruction + /// \p MI in order to support pairing of barriers and fences. + virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { + return false; + }; + /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; }; @@ -547,6 +553,8 @@ class SIGfx10CacheControl : public SIGfx7CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -2169,6 +2177,21 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } +bool SIGfx10CacheControl::insertBarrierStart( + MachineBasicBlock::iterator &MI) const { + // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU + // mode. This is because a CU mode release fence does not emit any wait, which + // is fine when only dealing with vmem, but isn't sufficient in the presence + // of barriers which do not go through vmem. + if (!ST.isCuModeEnabled()) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); + return true; +} + bool SIGfx11CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { @@ -2840,7 +2863,8 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { bool Changed = false; SIMemOpAccess MOA(MMI.getObjFileInfo()); - CC = SICacheControl::create(MF.getSubtarget()); + const GCNSubtarget &ST = MF.getSubtarget(); + CC = SICacheControl::create(ST); for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { @@ -2860,6 +2884,11 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { MI = II->getIterator(); } + if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { + Changed |= CC->insertBarrierStart(MI); + continue; + } + if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll index b584f6df647ce..122e683f79664 100644 --- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll @@ -54,9 +54,8 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 { ; GFX11-BACKOFF: ; %bb.0: ; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-BACKOFF-NEXT: flat_load_b32 v0, v[0:1] -; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-BACKOFF-NEXT: s_barrier +; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-BACKOFF-NEXT: flat_store_b32 v[2:3], v0 ; GFX11-BACKOFF-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-BACKOFF-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll index d23509b5aa812..b91963f08681c 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -150,6 +150,7 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds ; GFX10CU-NEXT: v_mov_b32_e32 v0, s13 ; GFX10CU-NEXT: s_waitcnt vmcnt(0) +; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10CU-NEXT: s_barrier ; GFX10CU-NEXT: ds_read_b32 v0, v0 ; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 7d44d9178f941..4b1d1560ebe0e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -214,9 +214,9 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_barrier ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll new file mode 100644 index 0000000000000..e921f581c00a7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s + +define amdgpu_kernel void @test_s_barrier() { +; GFX10-WGP-LABEL: test_s_barrier: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_barrier +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: test_s_barrier: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-CU-NEXT: s_barrier +; GFX10-CU-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: test_s_barrier: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_barrier +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: test_s_barrier: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX11-CU-NEXT: s_barrier +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: test_s_barrier: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_barrier_signal -1 +; GFX12-WGP-NEXT: s_barrier_wait -1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: test_s_barrier: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_alu 0xffe3 +; GFX12-CU-NEXT: s_barrier_signal -1 +; GFX12-CU-NEXT: s_barrier_wait -1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: test_s_barrier: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_alu 0xffe3 +; GFX1250-NEXT: s_barrier_signal -1 +; GFX1250-NEXT: s_barrier_wait -1 +; GFX1250-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.s.barrier() + ret void +} + +define amdgpu_kernel void @test_s_barrier_workgroup_fence() { +; GFX10-WGP-LABEL: test_s_barrier_workgroup_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_barrier +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: test_s_barrier_workgroup_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-CU-NEXT: s_barrier +; GFX10-CU-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: test_s_barrier_workgroup_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_barrier +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: test_s_barrier_workgroup_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX11-CU-NEXT: s_barrier +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: test_s_barrier_workgroup_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_barrier_signal -1 +; GFX12-WGP-NEXT: s_barrier_wait -1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: test_s_barrier_workgroup_fence: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_alu 0xffe3 +; GFX12-CU-NEXT: s_barrier_signal -1 +; GFX12-CU-NEXT: s_barrier_wait -1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: test_s_barrier_workgroup_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_alu 0xffe3 +; GFX1250-NEXT: s_barrier_signal -1 +; GFX1250-NEXT: s_barrier_wait -1 +; GFX1250-NEXT: s_endpgm +entry: + fence syncscope("workgroup") release + call void @llvm.amdgcn.s.barrier() + ret void +} + +define amdgpu_kernel void @test_s_barrier_agent_fence() { +; GFX10-WGP-LABEL: test_s_barrier_agent_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_barrier +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: test_s_barrier_agent_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-CU-NEXT: s_barrier +; GFX10-CU-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: test_s_barrier_agent_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_barrier +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: test_s_barrier_agent_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX11-CU-NEXT: s_barrier +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: test_s_barrier_agent_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_barrier_signal -1 +; GFX12-WGP-NEXT: s_barrier_wait -1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: test_s_barrier_agent_fence: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_alu 0xffe3 +; GFX12-CU-NEXT: s_barrier_signal -1 +; GFX12-CU-NEXT: s_barrier_wait -1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: test_s_barrier_agent_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_alu 0xffe3 +; GFX1250-NEXT: s_barrier_signal -1 +; GFX1250-NEXT: s_barrier_wait -1 +; GFX1250-NEXT: s_endpgm +entry: + fence syncscope("agent") release + call void @llvm.amdgcn.s.barrier() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir index 292091ad6e1e7..bb47392e7b7d7 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir @@ -62,7 +62,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: S_WAITCNT 0 ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec - ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1 ; GFX11-NEXT: S_BARRIER ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr ; GFX11-NEXT: S_WAITCNT 7 @@ -176,7 +176,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: S_WAITCNT 0 ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec - ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1 ; GFX11-NEXT: S_BARRIER ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr ; GFX11-NEXT: S_WAITCNT 7