diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c501ebba0c7ed..c85d2bb9fe9ae 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2514,6 +2514,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const { + bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); @@ -2521,53 +2523,51 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // writeback as all memory operations by the same thread are // sequentially consistent, and no other thread can access scratch // memory. + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (Pos == Position::AFTER) + ++MI; - // Other address spaces do not have a cache. - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) - return false; - - if (Pos == Position::AFTER) - ++MI; - - // global_wb is only necessary at system scope for GFX12.0, - // they're also necessary at device scope for GFX12.5. - // - // Emitting it for lower scopes is a slow no-op, so we omit it - // for performance. - switch (Scope) { - case SIAtomicScope::SYSTEM: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_SYS); - break; - case SIAtomicScope::AGENT: - // TODO DOCS - if (ST.hasGFX1250Insts()) { + // global_wb is only necessary at system scope for GFX12.0, + // and is also necessary at device scope for GFX12.5. + // + // Emitting it for lower scopes is a slow no-op, so we omit it + // for performance. 
+ switch (Scope) { + case SIAtomicScope::SYSTEM: BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_DEV); + .addImm(AMDGPU::CPol::SCOPE_SYS); + Changed = true; + break; + case SIAtomicScope::AGENT: + // TODO DOCS + if (ST.hasGFX1250Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) + .addImm(AMDGPU::CPol::SCOPE_DEV); + Changed = true; + } + break; + case SIAtomicScope::CLUSTER: + case SIAtomicScope::WORKGROUP: + // No WB necessary, but we still have to wait. + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No WB or wait necessary here, but insertWait takes care of that. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - break; - case SIAtomicScope::CLUSTER: - case SIAtomicScope::WORKGROUP: - // No WB necessary, but we still have to wait. - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No WB or wait necessary here. - return false; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - if (Pos == Position::AFTER) - --MI; + if (Pos == Position::AFTER) + --MI; + } // We always have to wait for previous memory operations (load/store) to // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), // we of course need to wait for that as well. 
- insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); - return true; + return Changed; } bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll new file mode 100644 index 0000000000000..1e6dc4e06ef4d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s + + +define float @test_barrier_workgroup_local_mmra(ptr addrspace(3) noundef %x, ptr addrspace(3) noundef %y, float %val) { +; GFX10-WGP-LABEL: test_barrier_workgroup_local_mmra: +; GFX10-WGP: ; %bb.0: +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_barrier +; GFX10-WGP-NEXT: ds_read_b32 v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-WGP-LABEL: test_barrier_workgroup_local_mmra: +; GFX11-WGP: ; %bb.0: +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_barrier +; GFX11-WGP-NEXT: ds_load_b32 v0, v1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: 
test_barrier_workgroup_local_mmra: +; GFX12-WGP: ; %bb.0: +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v2 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: s_barrier_signal -1 +; GFX12-WGP-NEXT: s_barrier_wait -1 +; GFX12-WGP-NEXT: ds_load_b32 v0, v1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_barrier_workgroup_local_mmra: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_barrier_signal -1 +; GFX1250-NEXT: s_barrier_wait -1 +; GFX1250-NEXT: ds_load_b32 v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + store float %val, ptr addrspace(3) %x + fence syncscope("workgroup") release, !mmra !0 + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire, !mmra !0 + %ret = load float, ptr addrspace(3) %y + ret float %ret +} + +define float @test_barrier_workgroup_global_mmra(ptr addrspace(1) noundef %x, ptr addrspace(1) noundef %y, float %val) { +; GFX10-WGP-LABEL: test_barrier_workgroup_global_mmra: +; GFX10-WGP: ; %bb.0: +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v4, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_barrier +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_load_dword v0, v[2:3], off +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-WGP-LABEL: test_barrier_workgroup_global_mmra: +; GFX11-WGP: ; %bb.0: +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_barrier +; 
GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_load_b32 v0, v[2:3], off +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: test_barrier_workgroup_global_mmra: +; GFX12-WGP: ; %bb.0: +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v[0:1], v4, off +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_barrier_signal -1 +; GFX12-WGP-NEXT: s_barrier_wait -1 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: global_load_b32 v0, v[2:3], off +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_barrier_workgroup_global_mmra: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[0:1], v4, off +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_barrier_signal -1 +; GFX1250-NEXT: s_barrier_wait -1 +; GFX1250-NEXT: global_load_b32 v0, v[2:3], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + store float %val, ptr addrspace(1) %x + fence syncscope("workgroup") release, !mmra !1 + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire, !mmra !1 + %ret = load float, ptr addrspace(1) %y + ret float %ret +} + +!0 = !{!"amdgpu-synchronize-as", !"local"} +!1 = !{!"amdgpu-synchronize-as", !"global"} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index cc42428e1aa06..8b0b099999f06 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -143,14 +143,17 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_release_fence: ; GFX12-WGP: 
; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -213,14 +216,17 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -283,14 +289,17 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -670,14 +679,17 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-WGP-LABEL: agent_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: agent_release_fence: ; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -740,14 +752,17 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: agent_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -810,14 +825,17 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: agent_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1197,14 +1215,17 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-WGP-LABEL: system_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: system_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1267,14 +1288,17 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: 
s_endpgm ; ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: system_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1337,14 +1361,17 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: system_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}