diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 25922c5af7da1..b721dcaf49d0f 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -442,17 +442,27 @@ class SICacheControl { SIAtomicAddrSpace AddrSpace, Position Pos) const = 0; - /// Inserts any necessary instructions at position \p Pos relative to - /// instruction \p MI to ensure previous memory instructions by this thread - /// with address spaces \p AddrSpace have completed and can be observed by - /// subsequent memory instructions by any thread executing in memory scope \p - /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is - /// between address spaces. Returns true iff any instructions inserted. - virtual bool insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const = 0; + /// Inserts any necessary writeback instructions at position \p Pos relative + /// to instruction \p MI to make previous memory operations by this thread + /// with address spaces \p AddrSpace available to other threads in memory + /// scope \p Scope. Does not insert waits; callers must call insertWait + /// separately. Returns true iff any instructions inserted. + virtual bool insertWriteback(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, + Position Pos) const = 0; + + /// Inserts writeback followed by an unconditional wait to implement a + /// release operation. + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + Changed |= insertWriteback(MI, Scope, AddrSpace, Pos); + Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, + AtomicOrdering::Release, /*AtomicsOnly=*/false); + return Changed; + } /// Handle operations that are considered non-volatile. /// See \ref isNonVolatileMemoryAccess @@ -496,11 +506,9 @@ class SIGfx6CacheControl final : public SICacheControl { SIAtomicAddrSpace AddrSpace, Position Pos) const override; - bool insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const override; + bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; }; /// Generates code sequences for the memory model of GFX10/11. @@ -537,12 +545,10 @@ class SIGfx10CacheControl final : public SICacheControl { bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; - bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, - Position Pos) const override { - return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, - /*AtomicsOnly=*/false); + bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override { + return false; } }; @@ -594,9 +600,9 @@ class SIGfx12CacheControl final : public SICacheControl { bool handleCooperativeAtomic(MachineInstr &MI) const override; - bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, - Position Pos) const override; + bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -1464,67 +1470,56 @@ bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = false; +bool SIGfx6CacheControl::insertWriteback(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + if (!ST.hasGFX90AInsts()) + return false; - if (ST.hasGFX90AInsts()) { - MachineBasicBlock &MBB = *MI->getParent(); - const DebugLoc &DL = MI->getDebugLoc(); + bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); + const DebugLoc &DL = MI->getDebugLoc(); - if (Pos == Position::AFTER) - ++MI; + if (Pos == Position::AFTER) + ++MI; - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed - // to initiate writeback of any dirty cache lines of earlier writes by - // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the - // writeback has completed. + if (canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed + // to initiate writeback of any dirty cache lines of earlier writes by + // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the + // writeback has completed. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); + Changed = true; + break; + case SIAtomicScope::AGENT: + if (ST.hasGFX940Insts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); Changed = true; - break; - case SIAtomicScope::AGENT: - if (ST.hasGFX940Insts()) { - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate agent scope. - .addImm(AMDGPU::CPol::SC1); - - // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is - // SIAtomicScope::AGENT, the following insertWait will generate the - // required "S_WAITCNT vmcnt(0)". - Changed = true; - } - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it - // would writeback, and would require an otherwise unnecessary - // "S_WAITCNT vmcnt(0)". - break; - default: - llvm_unreachable("Unsupported synchronization scope"); } + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it + // would writeback, and would require an otherwise unnecessary + // "S_WAITCNT vmcnt(0)". + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - - if (Pos == Position::AFTER) - --MI; } - // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other - // S_WAITCNT needed. - Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, - /*AtomicsOnly=*/false); + if (Pos == Position::AFTER) + --MI; return Changed; } @@ -2068,75 +2063,66 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return true; } -bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - const DebugLoc &DL = MI->getDebugLoc(); - +bool SIGfx12CacheControl::insertWriteback(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { // The scratch address space does not need the global memory cache // writeback as all memory operations by the same thread are // sequentially consistent, and no other thread can access scratch // memory. - if (canAffectGlobalAddrSpace(AddrSpace)) { - if (Pos == Position::AFTER) - ++MI; + if (!canAffectGlobalAddrSpace(AddrSpace)) + return false; - // global_wb is only necessary at system scope for GFX12.0, - // they're also necessary at device scope for GFX12.5 as stores - // cannot report completion earlier than L2. - // - // Emitting it for lower scopes is a slow no-op, so we omit it - // for performance. - std::optional NeedsWB; - switch (Scope) { - case SIAtomicScope::SYSTEM: - NeedsWB = AMDGPU::CPol::SCOPE_SYS; - break; - case SIAtomicScope::AGENT: - // GFX12.5 may have >1 L2 per device so we must emit a device scope WB. - if (ST.hasGFX1250Insts()) - NeedsWB = AMDGPU::CPol::SCOPE_DEV; - break; - case SIAtomicScope::CLUSTER: - case SIAtomicScope::WORKGROUP: - // No WB necessary, but we still have to wait. - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No WB or wait necessary here, but insertWait takes care of that. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } + bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); + const DebugLoc &DL = MI->getDebugLoc(); - if (NeedsWB) { - // Target requires a waitcnt to ensure that the proceeding store - // proceeding store/rmw operations have completed in L2 so their data will - // be written back by the WB instruction. - if (ST.hasINVWBL2WaitCntRequirement()) - insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - /*IsCrossAddrSpaceOrdering=*/false, Pos, - AtomicOrdering::Release, - /*AtomicsOnly=*/false); - - BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB); - Changed = true; + if (Pos == Position::AFTER) + ++MI; + + // global_wb is only necessary at system scope for GFX12.0, + // they're also necessary at device scope for GFX12.5 as stores + // cannot report completion earlier than L2. + // + // Emitting it for lower scopes is a slow no-op, so we omit it + // for performance. + std::optional NeedsWB; + switch (Scope) { + case SIAtomicScope::SYSTEM: + NeedsWB = AMDGPU::CPol::SCOPE_SYS; + break; + case SIAtomicScope::AGENT: + // GFX12.5 may have >1 L2 per device so we must emit a device scope WB. + if (ST.hasGFX1250Insts()) + NeedsWB = AMDGPU::CPol::SCOPE_DEV; + break; + case SIAtomicScope::CLUSTER: + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + + if (NeedsWB) { + // Target requires a waitcnt to ensure that the proceeding store + // proceeding store/rmw operations have completed in L2 so their data will + // be written back by the WB instruction. + if (ST.hasINVWBL2WaitCntRequirement()) { + insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + /*IsCrossAddrSpaceOrdering=*/false, Pos, + AtomicOrdering::Release, + /*AtomicsOnly=*/false); } - if (Pos == Position::AFTER) - --MI; + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB); + Changed = true; } - // We always have to wait for previous memory operations (load/store) to - // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), - // we of course need to wait for that as well. - Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, - /*AtomicsOnly=*/false); + if (Pos == Position::AFTER) + --MI; return Changed; }