diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 3149ebc2f0bf4..75091e5dad179 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -148,6 +148,7 @@ class SIMemOpInfo final { bool IsNonTemporal = false; bool IsLastUse = false; bool IsCooperative = false; + bool IsAVNone = false; // TODO: Should we assume Cooperative=true if no MMO is present? SIMemOpInfo( @@ -160,12 +161,12 @@ class SIMemOpInfo final { AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent, bool IsVolatile = false, bool IsNonTemporal = false, bool IsLastUse = false, bool IsCooperative = false, - bool CanDemoteWorkgroupToWavefront = false) + bool CanDemoteWorkgroupToWavefront = false, bool IsAVNone = false) : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace), IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal), - IsLastUse(IsLastUse), IsCooperative(IsCooperative) { + IsLastUse(IsLastUse), IsCooperative(IsCooperative), IsAVNone(IsAVNone) { if (Ordering == AtomicOrdering::NotAtomic) { assert(!IsCooperative && "Cannot be cooperative & non-atomic!"); @@ -277,6 +278,9 @@ class SIMemOpInfo final { /// \returns True if this is a cooperative load or store atomic. bool isCooperative() const { return IsCooperative; } + /// \returns True if MakeAvailable/MakeVisible should be suppressed. + bool isAVNone() const { return IsAVNone; } + /// \returns True if ordering constraint of the machine instruction used to /// create this SIMemOpInfo is unordered or higher, false otherwise. bool isAtomic() const { @@ -451,12 +455,11 @@ class SICacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const = 0; - /// Inserts writeback followed by an unconditional wait to implement a - /// release operation. 
+ /// Inserts writeback (unless \p IsAVNone) followed by an unconditional wait. bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = insertWriteback(MI, Scope, AddrSpace, Pos); + Position Pos, bool IsAVNone) const { + bool Changed = !IsAVNone && insertWriteback(MI, Scope, AddrSpace, Pos); Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, /*AtomicsOnly=*/false); @@ -732,6 +735,33 @@ getSynchronizeAddrSpaceMD(const MachineInstr &MI) { return Result; } +static void diagnoseUnknownAVMetadata(const MachineInstr &MI, + StringRef Suffix) { + const MachineFunction *MF = MI.getMF(); + const Function &Fn = MF->getFunction(); + SmallString<128> Str; + raw_svector_ostream OS(Str); + OS << "unknown amdgcn-av metadata '" << Suffix << '\''; + Fn.getContext().diagnose( + DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning)); +} + +static bool hasAVNoneMMRA(const MachineInstr &MI) { + auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); + if (!MMRA) + return false; + bool TagFound = false; + for (const auto &[Prefix, Suffix] : MMRA) { + if (Prefix != "amdgcn-av") + continue; + if (Suffix == "none") + TagFound = true; + else + diagnoseUnknownAVMetadata(MI, Suffix); + } + return TagFound; +} + } // end anonymous namespace void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, @@ -875,7 +905,7 @@ std::optional SIMemOpAccess::constructFromMIWithMMO( return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, IsNonTemporal, IsLastUse, IsCooperative, - CanDemoteWorkgroupToWavefront); + CanDemoteWorkgroupToWavefront, hasAVNoneMMRA(*MI)); } std::optional @@ -945,7 +975,7 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { return SIMemOpInfo(ST, 
Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic, false, false, false, false, - CanDemoteWorkgroupToWavefront); + CanDemoteWorkgroupToWavefront, hasAVNoneMMRA(*MI)); } std::optional SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( @@ -2317,9 +2347,10 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order, /*AtomicsOnly=*/true); - Changed |= CC->insertAcquire(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), - Position::AFTER); + if (!MOI.isAVNone()) { + Changed |= CC->insertAcquire( + MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); + } } return Changed; @@ -2363,11 +2394,12 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, Changed |= CC->handleCooperativeAtomic(*MI); if (MOI.getOrdering() == AtomicOrdering::Release || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= CC->insertRelease(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), - MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE); + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= + CC->insertRelease(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE, MOI.isAVNone()); + } Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); return Changed; @@ -2412,7 +2444,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, if (Order == AtomicOrdering::Release || Order == AtomicOrdering::AcquireRelease || - Order == AtomicOrdering::SequentiallyConsistent) + Order == AtomicOrdering::SequentiallyConsistent) { /// TODO: This relies on a barrier always generating a waitcnt /// for LDS to ensure it is not reordered with the completion of /// the proceeding LDS operations. 
If barrier had a memory @@ -2422,18 +2454,21 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, /// adding S_WAITCNT before a S_BARRIER. Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace, MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE); + Position::BEFORE, MOI.isAVNone()); + } // TODO: If both release and invalidate are happening they could be combined // to use the single "BUFFER_WBINV*" instruction. This could be done by // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to // track cache invalidate and write back instructions. - if (Order == AtomicOrdering::Acquire || - Order == AtomicOrdering::AcquireRelease || - Order == AtomicOrdering::SequentiallyConsistent) + if ((Order == AtomicOrdering::Acquire || + Order == AtomicOrdering::AcquireRelease || + Order == AtomicOrdering::SequentiallyConsistent) && + !MOI.isAVNone()) { Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace, Position::BEFORE); + } return Changed; } @@ -2469,11 +2504,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, if (Order == AtomicOrdering::Release || Order == AtomicOrdering::AcquireRelease || Order == AtomicOrdering::SequentiallyConsistent || - MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= CC->insertRelease(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), - MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE); + MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= + CC->insertRelease(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE, MOI.isAVNone()); + } if (Order == AtomicOrdering::Acquire || Order == AtomicOrdering::AcquireRelease || @@ -2486,9 +2522,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, isAtomicRet(*MI) ? 
SIMemOp::LOAD : SIMemOp::STORE, MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order, /*AtomicsOnly=*/true); - Changed |= CC->insertAcquire(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), - Position::AFTER); + if (!MOI.isAVNone()) { + Changed |= CC->insertAcquire( + MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); + } } Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true); diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-none.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-none.ll new file mode 100644 index 0000000000000..89230fe1b7cdd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-none.ll @@ -0,0 +1,722 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s + +; Test that the amdgcn-av:none MMRA tag suppresses MakeAvailable/MakeVisible +; (cache writeback/invalidation) while preserving ordering (waits). + +; Fences: workgroup, cluster, agent and system scopes; varying orderings.
+ +define amdgpu_kernel void @workgroup_acq_rel_fence_av_none() { +; GFX90A-LABEL: workgroup_acq_rel_fence_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: workgroup_acq_rel_fence_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: workgroup_acq_rel_fence_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup") acq_rel, !mmra !0 + ret void +} + +define amdgpu_kernel void @cluster_seq_cst_fence_av_none() { +; GFX90A-LABEL: cluster_seq_cst_fence_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: cluster_seq_cst_fence_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: cluster_seq_cst_fence_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: cluster_seq_cst_fence_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +entry: + fence syncscope("cluster") seq_cst, !mmra !0 
+ ret void +} + +define amdgpu_kernel void @agent_acquire_fence_av_none() { +; GFX90A-LABEL: agent_acquire_fence_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: agent_acquire_fence_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: agent_acquire_fence_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: agent_acquire_fence_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") acquire, !mmra !0 + ret void +} + +define amdgpu_kernel void @agent_release_fence_av_none() { +; GFX90A-LABEL: agent_release_fence_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: agent_release_fence_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: agent_release_fence_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: agent_release_fence_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") release, !mmra !0 + ret void +} + +define amdgpu_kernel void @system_seq_cst_fence_av_none() { +; GFX90A-LABEL: system_seq_cst_fence_av_none: +; GFX90A: ; %bb.0: ; %entry +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: system_seq_cst_fence_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: system_seq_cst_fence_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +entry: + fence seq_cst, !mmra !0 + ret void +} + +; Atomic loads: acquire across scopes. + +define i32 @workgroup_acquire_load_av_none(ptr addrspace(1) %ptr) { +; GFX90A-LABEL: workgroup_acquire_load_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_load_dword v0, v[0:1], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-TGSPLIT-LABEL: workgroup_acquire_load_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-TGSPLIT-NEXT: global_load_dword v0, v[0:1], off glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: workgroup_acquire_load_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: 
s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SE +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-CU-LABEL: workgroup_acquire_load_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_expcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_setpc_b64 s[30:31] +entry: + %val = load atomic i32, ptr addrspace(1) %ptr syncscope("workgroup") acquire, align 4, !mmra !0 + ret i32 %val +} + +define i32 @agent_acquire_load_av_none(ptr addrspace(1) %ptr) { +; GFX90A-LABEL: agent_acquire_load_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_load_dword v0, v[0:1], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-TGSPLIT-LABEL: agent_acquire_load_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-TGSPLIT-NEXT: global_load_dword v0, v[0:1], off glc +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: agent_acquire_load_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-CU-LABEL: agent_acquire_load_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_expcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_setpc_b64 s[30:31] +entry: + %val = load atomic i32, ptr addrspace(1) %ptr syncscope("agent") acquire, align 4, !mmra !0 + ret i32 %val +} + +define i32 @system_acquire_load_av_none(ptr addrspace(1) %ptr) { +; GFX90A-LABEL: system_acquire_load_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_load_dword v0, v[0:1], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-TGSPLIT-LABEL: system_acquire_load_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-TGSPLIT-NEXT: global_load_dword v0, v[0:1], off glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: system_acquire_load_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-CU-LABEL: system_acquire_load_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_expcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_setpc_b64 s[30:31] +entry: + %val = load atomic i32, ptr addrspace(1) %ptr acquire, align 4, !mmra !0 + ret i32 %val +} + +; Atomic stores: release across scopes. 
+ +define void @workgroup_release_store_av_none(ptr addrspace(1) %ptr, i32 %val) { +; GFX90A-LABEL: workgroup_release_store_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-TGSPLIT-LABEL: workgroup_release_store_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: workgroup_release_store_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SE +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-CU-LABEL: workgroup_release_store_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_expcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-CU-NEXT: s_setpc_b64 s[30:31] +entry: + store atomic i32 %val, ptr addrspace(1) %ptr syncscope("workgroup") release, align 4, !mmra !0 + ret void +} + +define void @agent_release_store_av_none(ptr addrspace(1) %ptr, i32 %val) { +; GFX90A-LABEL: agent_release_store_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-TGSPLIT-LABEL: agent_release_store_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: agent_release_store_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-WGP-NEXT: ; 
kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-CU-LABEL: agent_release_store_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_expcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-CU-NEXT: s_setpc_b64 s[30:31] +entry: + store atomic i32 %val, ptr addrspace(1) %ptr syncscope("agent") release, align 4, !mmra !0 + ret void +} + +define void @system_release_store_av_none(ptr addrspace(1) %ptr, i32 %val) { +; GFX90A-LABEL: system_release_store_av_none: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-TGSPLIT-LABEL: system_release_store_av_none: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed 
$exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: system_release_store_av_none: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-CU-LABEL: system_release_store_av_none: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_expcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS +; GFX12-CU-NEXT: s_setpc_b64 s[30:31] +entry: + store atomic i32 %val, ptr addrspace(1) %ptr release, align 4, !mmra !0 + ret void +} + +; Atomicrmw: agent acq_rel and system seq_cst. 
+
+define i32 @agent_acq_rel_atomicrmw_av_none(ptr addrspace(1) %ptr, i32 %val) {
+; GFX90A-LABEL: agent_acq_rel_atomicrmw_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add v0, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: agent_acq_rel_atomicrmw_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_atomic_add v0, v[0:1], v2, off glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: agent_acq_rel_atomicrmw_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: agent_acq_rel_atomicrmw_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %ret = atomicrmw add ptr addrspace(1) %ptr, i32 %val syncscope("agent") acq_rel, !mmra !0 ; tagged amdgcn-av "none" via !0; the checks above list only waits around the atomic
+  ret i32 %ret
+}
+
+define i32 @system_seq_cst_atomicrmw_av_none(ptr addrspace(1) %ptr, i32 %val) {
+; GFX90A-LABEL: system_seq_cst_atomicrmw_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add v0, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: system_seq_cst_atomicrmw_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_atomic_add v0, v[0:1], v2, off glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: system_seq_cst_atomicrmw_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: system_seq_cst_atomicrmw_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %ret = atomicrmw add ptr addrspace(1) %ptr, i32 %val seq_cst, !mmra !0 ; no syncscope operand; the GFX12 checks above expect scope:SCOPE_SYS
+  ret i32 %ret
+}
+
+; Cmpxchg: cluster acq_rel (acquire failure ordering).
+
+define { i32, i1 } @cluster_acq_rel_cmpxchg_av_none(ptr addrspace(1) %ptr, i32 %cmp, i32 %new) {
+; GFX90A-LABEL: cluster_acq_rel_cmpxchg_av_none:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_cmpswap v0, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-TGSPLIT-LABEL: cluster_acq_rel_cmpxchg_av_none:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[0:1], v[4:5], off glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX90A-TGSPLIT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-WGP-LABEL: cluster_acq_rel_cmpxchg_av_none:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_wait_expcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v1
+; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s0, v0, v2
+; GFX12-WGP-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-CU-LABEL: cluster_acq_rel_cmpxchg_av_none:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_expcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v4, v1
+; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX12-CU-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s0, v0, v2
+; GFX12-CU-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %ret = cmpxchg ptr addrspace(1) %ptr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire, !mmra !0 ; the GFX12 checks above expect scope:SCOPE_DEV for this cluster-scope op
+  ret { i32, i1 } %ret
+}
+
+!0 = !{!"amdgcn-av", !"none"} ; MMRA tag (prefix "amdgcn-av", suffix "none") referenced by the !mmra !0 operands above
diff --git
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-unknown.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-unknown.ll
new file mode 100644
index 0000000000000..fd5216f4b13d8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-av-unknown.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -o /dev/null %s 2>&1 | FileCheck %s
+; Expect a warning that quotes the unrecognized amdgcn-av suffix ('bogus' below); stderr is piped into FileCheck.
+; CHECK: warning: {{.*}}: unknown amdgcn-av metadata 'bogus'
+
+define void @test_unknown_av() {
+entry:
+  fence seq_cst, !mmra !0 ; fence carrying the unrecognized tag
+  ret void
+}
+
+!0 = !{!"amdgcn-av", !"bogus"} ; "bogus" is the suffix the expected warning quotes