diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 21419aab1a43fe..c03709194917a9 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -280,23 +280,23 @@ class SICacheControl { virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const = 0; - /// Inserts any necessary instructions at position \p Pos relative - /// to instruction \p MI to ensure any caches associated with - /// address spaces \p AddrSpace for memory scopes up to memory scope - /// \p Scope are invalidated. Returns true iff any instructions - /// inserted. - virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const = 0; + /// Inserts any necessary instructions at position \p Pos relative to + /// instruction \p MI to ensure any subsequent memory instructions of this + /// thread with address spaces \p AddrSpace will observe the previous memory + /// operations by any thread for memory scopes up to memory scope \p Scope. + /// Returns true iff any instructions inserted. + virtual bool insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const = 0; /// Inserts any necessary instructions at position \p Pos relative - /// to instruction \p MI to ensure memory instructions of kind \p Op - /// associated with address spaces \p AddrSpace have completed as - /// observed by other memory instructions executing in memory scope - /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory - /// ordering is between address spaces. Returns true iff any - /// instructions inserted. + /// to instruction \p MI to ensure memory instructions before \p Pos of kind + /// \p Op associated with address spaces \p AddrSpace have completed. 
Used + /// between memory instructions to enforce the order they become visible as + /// observed by other memory instructions executing in memory scope \p Scope. + /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between + /// address spaces. Returns true iff any instructions inserted. virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, @@ -304,6 +304,18 @@ class SICacheControl { bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; + /// Inserts any necessary instructions at position \p Pos relative to + /// instruction \p MI to ensure previous memory instructions by this thread + /// with address spaces \p AddrSpace have completed and can be observed by + /// subsequent memory instructions by any thread executing in memory scope \p + /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is + /// between address spaces. Returns true iff any instructions inserted. + virtual bool insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const = 0; + /// Virtual destructor to allow derivations to be deleted. 
virtual ~SICacheControl() = default; @@ -334,10 +346,16 @@ class SIGfx6CacheControl : public SICacheControl { bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; - bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + + bool insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -352,10 +370,10 @@ class SIGfx7CacheControl : public SIGfx6CacheControl { SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}; - bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; }; @@ -380,10 +398,10 @@ class SIGfx10CacheControl : public SIGfx7CacheControl { bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; - bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -678,9 +696,6 @@ bool SIGfx6CacheControl::enableLoadCacheBypass( bool Changed = false; if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { - /// TODO: Do not set glc for rmw atomic operations as they - /// implicitly bypass the L1 cache. 
- switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -718,10 +733,10 @@ bool SIGfx6CacheControl::enableNonTemporal( return Changed; } -bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { +bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { if (!InsertCacheInv) return false; @@ -802,12 +817,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: case SIAtomicScope::WORKGROUP: - // If no cross address space ordering then an LDS waitcnt is not - // needed as LDS operations for all waves are executed in a - // total global ordering as observed by all waves. Required if - // also synchronizing with global/GDS memory as LDS operations - // could be reordered with respect to later global/GDS memory - // operations of the same wave. + // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is + // not needed as LDS operations for all waves are executed in a total + // global ordering as observed by all waves. Required if also + // synchronizing with global/GDS memory as LDS operations could be + // reordered with respect to later global/GDS memory operations of the + // same wave. LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WAVEFRONT: @@ -824,12 +839,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - // If no cross address space ordering then an GDS waitcnt is not - // needed as GDS operations for all waves are executed in a - // total global ordering as observed by all waves. Required if - // also synchronizing with global/LDS memory as GDS operations - // could be reordered with respect to later global/LDS memory - // operations of the same wave. 
+ // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)" + // is not needed as GDS operations for all waves are executed in a total + // global ordering as observed by all waves. Required if also + // synchronizing with global/LDS memory as GDS operations could be + // reordered with respect to later global/LDS memory operations of the + // same wave. LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WORKGROUP: @@ -859,10 +874,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { +bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos); +} + +bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { if (!InsertCacheInv) return false; @@ -873,9 +897,9 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, const GCNSubtarget &STM = MBB.getParent()->getSubtarget(); - const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS() - ? AMDGPU::BUFFER_WBINVL1 - : AMDGPU::BUFFER_WBINVL1_VOL; + const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() + ? 
AMDGPU::BUFFER_WBINVL1 + : AMDGPU::BUFFER_WBINVL1_VOL; if (Pos == Position::AFTER) ++MI; @@ -884,7 +908,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(Flush)); + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); Changed = true; break; case SIAtomicScope::WORKGROUP: @@ -902,7 +926,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, /// sequentially consistent, and no other thread can access scratch /// memory. - /// Other address spaces do not hava a cache. + /// Other address spaces do not have a cache. if (Pos == Position::AFTER) --MI; @@ -930,8 +954,8 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( case SIAtomicScope::WORKGROUP: // In WGP mode the waves of a work-group can be executing on either CU of // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in - // CU mode and all waves of a work-group are on the same CU, and so the - // L0 does not need to be bypassed. + // CU mode all waves of a work-group are on the same CU, and so the L0 + // does not need to be bypassed. 
if (!CuMode) Changed |= enableGLCBit(MI); break; case SIAtomicScope::WAVEFRONT: @@ -964,10 +988,10 @@ bool SIGfx10CacheControl::enableNonTemporal( return Changed; } -bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { +bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { if (!InsertCacheInv) return false; @@ -1074,12 +1098,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: case SIAtomicScope::WORKGROUP: - // If no cross address space ordering then an LDS waitcnt is not - // needed as LDS operations for all waves are executed in a - // total global ordering as observed by all waves. Required if - // also synchronizing with global/GDS memory as LDS operations - // could be reordered with respect to later global/GDS memory - // operations of the same wave. + // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is + // not needed as LDS operations for all waves are executed in a total + // global ordering as observed by all waves. Required if also + // synchronizing with global/GDS memory as LDS operations could be + // reordered with respect to later global/GDS memory operations of the + // same wave. LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WAVEFRONT: @@ -1096,12 +1120,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - // If no cross address space ordering then an GDS waitcnt is not - // needed as GDS operations for all waves are executed in a - // total global ordering as observed by all waves. 
 Required if - // also synchronizing with global/LDS memory as GDS operations - // could be reordered with respect to later global/LDS memory - // operations of the same wave. + // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)" + // is not needed as GDS operations for all waves are executed in a total + // global ordering as observed by all waves. Required if also + // synchronizing with global/LDS memory as GDS operations could be + // reordered with respect to later global/LDS memory operations of the + // same wave. LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WORKGROUP: @@ -1177,9 +1201,9 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER); - Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), - Position::AFTER); + Changed |= CC->insertAcquire(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + Position::AFTER); } return Changed; @@ -1203,11 +1227,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, if (MOI.isAtomic()) { if (MOI.getOrdering() == AtomicOrdering::Release || MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= CC->insertWait(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), - SIMemOp::LOAD | SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE); + Changed |= CC->insertRelease(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); return Changed; } @@ -1239,19 +1262,23 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, /// ordering and memory scope, then library does not need to /// generate a fence. Could add support in this file for /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally - /// adding waitcnt before a S_BARRIER. 
- Changed |= CC->insertWait(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), - SIMemOp::LOAD | SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE); + /// adding S_WAITCNT before an S_BARRIER. + Changed |= CC->insertRelease(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); + + // TODO: If both release and invalidate are happening they could be combined + // to use the single "BUFFER_WBL2" instruction. This could be done by + // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to + // track cache invalidate and write back instructions. if (MOI.getOrdering() == AtomicOrdering::Acquire || MOI.getOrdering() == AtomicOrdering::AcquireRelease || MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), - Position::BEFORE); + Changed |= CC->insertAcquire(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + Position::BEFORE); return Changed; } @@ -1270,11 +1297,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MOI.getOrdering() == AtomicOrdering::AcquireRelease || MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= CC->insertWait(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), - SIMemOp::LOAD | SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE); + Changed |= CC->insertRelease(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); if (MOI.getOrdering() == AtomicOrdering::Acquire || MOI.getOrdering() == AtomicOrdering::AcquireRelease || @@ -1287,9 +1313,9 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, SIMemOp::STORE, MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER); - Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(), - 
MOI.getOrderingAddrSpace(), - Position::AFTER); + Changed |= CC->insertAcquire(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + Position::AFTER); } return Changed;