diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0dac6d2d7ee4b..49aba39872138 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -302,16 +302,17 @@ class SICacheControl {
   SICacheControl(const GCNSubtarget &ST);
 
-  /// Sets named bit \p BitName to "true" if present in instruction \p MI.
+  /// Sets the given CPol \p Bits to "true" if present in instruction \p MI.
   /// \returns Returns true if \p MI is modified, false otherwise.
-  bool enableNamedBit(const MachineBasicBlock::iterator MI,
-                      AMDGPU::CPol::CPol Bit) const;
+  bool enableCPolBits(const MachineBasicBlock::iterator MI,
+                      unsigned Bits) const;
 
   /// Check if any atomic operation on AS can affect memory accessible via the
   /// global address space.
   bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
 
 public:
+  using CPol = AMDGPU::CPol::CPol;
 
   /// Create a cache control for the subtarget \p ST.
   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
@@ -401,21 +402,9 @@ class SICacheControl {
   virtual ~SICacheControl() = default;
 };
 
+/// Generates code sequences for the memory model of all GFX targets below
+/// GFX10.
 class SIGfx6CacheControl : public SICacheControl {
-protected:
-
-  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
-  /// is modified, false otherwise.
-  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
-    return enableNamedBit(MI, AMDGPU::CPol::GLC);
-  }
-
-  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
-  /// is modified, false otherwise.
-  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
-    return enableNamedBit(MI, AMDGPU::CPol::SLC);
-  }
-
 public:
 
   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
@@ -454,114 +443,9 @@ class SIGfx6CacheControl : public SICacheControl {
                      Position Pos) const override;
 };
 
-class SIGfx7CacheControl : public SIGfx6CacheControl {
-public:
-
-  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
-
-  bool insertAcquire(MachineBasicBlock::iterator &MI,
-                     SIAtomicScope Scope,
-                     SIAtomicAddrSpace AddrSpace,
-                     Position Pos) const override;
-
-};
-
-class SIGfx90ACacheControl : public SIGfx7CacheControl {
-public:
-
-  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
-
-  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
-                             SIAtomicScope Scope,
-                             SIAtomicAddrSpace AddrSpace) const override;
-
-  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
-                            SIAtomicScope Scope,
-                            SIAtomicAddrSpace AddrSpace) const override;
-
-  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
-                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsVolatile, bool IsNonTemporal,
-                                      bool IsLastUse) const override;
-
-  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
-                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                  bool IsCrossAddrSpaceOrdering, Position Pos,
-                  AtomicOrdering Order, bool AtomicsOnly) const override;
-
-  bool insertAcquire(MachineBasicBlock::iterator &MI,
-                     SIAtomicScope Scope,
-                     SIAtomicAddrSpace AddrSpace,
-                     Position Pos) const override;
-
-  bool insertRelease(MachineBasicBlock::iterator &MI,
-                     SIAtomicScope Scope,
-                     SIAtomicAddrSpace AddrSpace,
-                     bool IsCrossAddrSpaceOrdering,
-                     Position Pos) const override;
-};
-
-class SIGfx940CacheControl : public SIGfx90ACacheControl {
-protected:
-
-  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
-  /// is modified, false otherwise.
-  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
-    return enableNamedBit(MI, AMDGPU::CPol::SC0);
-  }
-
-  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
-  /// is modified, false otherwise.
-  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
-    return enableNamedBit(MI, AMDGPU::CPol::SC1);
-  }
-
-  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
-  /// is modified, false otherwise.
-  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
-    return enableNamedBit(MI, AMDGPU::CPol::NT);
-  }
-
-public:
-  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
-
-  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
-                             SIAtomicScope Scope,
-                             SIAtomicAddrSpace AddrSpace) const override;
-
-  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
-                              SIAtomicScope Scope,
-                              SIAtomicAddrSpace AddrSpace) const override;
-
-  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
-                            SIAtomicScope Scope,
-                            SIAtomicAddrSpace AddrSpace) const override;
-
-  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
-                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsVolatile, bool IsNonTemporal,
-                                      bool IsLastUse) const override;
-
-  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
-                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;
-
-  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
-                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
-                     Position Pos) const override;
-};
-
-class SIGfx10CacheControl : public SIGfx7CacheControl {
-protected:
-
-  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
-  /// is modified, false otherwise.
-  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
-    return enableNamedBit(MI, AMDGPU::CPol::DLC);
-  }
-
+class SIGfx10CacheControl : public SIGfx6CacheControl {
 public:
-
-  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
+  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
 
   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
@@ -601,6 +485,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
   // \returns Returns true if \p MI is modified, false otherwise.
   bool setTH(const MachineBasicBlock::iterator MI,
              AMDGPU::CPol::CPol Value) const;
+
   // Sets Scope policy to \p Value if CPol operand is present in instruction \p
   // MI. \returns Returns true if \p MI is modified, false otherwise.
   bool setScope(const MachineBasicBlock::iterator MI,
@@ -1006,13 +891,13 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
 }
 
-bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
-                                    AMDGPU::CPol::CPol Bit) const {
+bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
+                                    unsigned Bits) const {
   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
   if (!CPol)
     return false;
 
-  CPol->setImm(CPol->getImm() | Bit);
+  CPol->setImm(CPol->getImm() | Bits);
   return true;
 }
 
@@ -1028,14 +913,8 @@ bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
 /* static */
 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
   GCNSubtarget::Generation Generation = ST.getGeneration();
-  if (ST.hasGFX940Insts())
-    return std::make_unique<SIGfx940CacheControl>(ST);
-  if (ST.hasGFX90AInsts())
-    return std::make_unique<SIGfx90ACacheControl>(ST);
-  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
-    return std::make_unique<SIGfx6CacheControl>(ST);
   if (Generation < AMDGPUSubtarget::GFX10)
-    return std::make_unique<SIGfx7CacheControl>(ST);
+    return std::make_unique<SIGfx6CacheControl>(ST);
   if (Generation < AMDGPUSubtarget::GFX11)
     return std::make_unique<SIGfx10CacheControl>(ST);
   if (Generation < AMDGPUSubtarget::GFX12)
@@ -1048,33 +927,61 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
     SIAtomicScope Scope,
     SIAtomicAddrSpace AddrSpace) const {
   assert(MI->mayLoad() && !MI->mayStore());
-  bool Changed = false;
 
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-    case SIAtomicScope::AGENT:
+  if (!canAffectGlobalAddrSpace(AddrSpace)) {
+    /// The scratch address space does not need the global memory caches
+    /// to be bypassed as all memory operations by the same thread are
+    /// sequentially consistent, and no other thread can access scratch
+    /// memory.
+
+    /// Other address spaces do not have a cache.
+    return false;
+  }
+
+  bool Changed = false;
+  switch (Scope) {
+  case SIAtomicScope::SYSTEM:
+    if (ST.hasGFX940Insts()) {
+      // Set SC bits to indicate system scope.
+      Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+      break;
+    }
+    [[fallthrough]];
+  case SIAtomicScope::AGENT:
+    if (ST.hasGFX940Insts()) {
+      // Set SC bits to indicate agent scope.
+      Changed |= enableCPolBits(MI, CPol::SC1);
+    } else {
       // Set L1 cache policy to MISS_EVICT.
       // Note: there is no L2 cache bypass policy at the ISA level.
-      Changed |= enableGLCBit(MI);
-      break;
-    case SIAtomicScope::WORKGROUP:
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // No cache to bypass.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
+      Changed |= enableCPolBits(MI, CPol::GLC);
     }
+    break;
+  case SIAtomicScope::WORKGROUP:
+    if (ST.hasGFX940Insts()) {
+      // In threadgroup split mode the waves of a work-group can be executing
+      // on different CUs. Therefore need to bypass the L1 which is per CU.
+      // Otherwise in non-threadgroup split mode all waves of a work-group are
+      // on the same CU, and so the L1 does not need to be bypassed. Setting
+      // SC bits to indicate work-group scope will do this automatically.
+      Changed |= enableCPolBits(MI, CPol::SC0);
+    } else if (ST.hasGFX90AInsts()) {
+      // In threadgroup split mode the waves of a work-group can be executing
+      // on different CUs. Therefore need to bypass the L1 which is per CU.
+      // Otherwise in non-threadgroup split mode all waves of a work-group are
+      // on the same CU, and so the L1 does not need to be bypassed.
+      if (ST.isTgSplitEnabled())
+        Changed |= enableCPolBits(MI, CPol::GLC);
     }
+    break;
+  case SIAtomicScope::WAVEFRONT:
+  case SIAtomicScope::SINGLETHREAD:
+    // No cache to bypass.
+    break;
+  default:
+    llvm_unreachable("Unsupported synchronization scope");
   }
 
-  /// The scratch address space does not need the global memory caches
-  /// to be bypassed as all memory operations by the same thread are
-  /// sequentially consistent, and no other thread can access scratch
-  /// memory.
-
-  /// Other address spaces do not have a cache.
-
   return Changed;
 }
 
@@ -1085,8 +992,39 @@ bool SIGfx6CacheControl::enableStoreCacheBypass(
   assert(!MI->mayLoad() && MI->mayStore());
   bool Changed = false;
 
-  /// The L1 cache is write through so does not need to be bypassed. There is no
-  /// bypass control for the L2 cache at the isa level.
+  /// For targets other than GFX940, the L1 cache is write through so does not
+  /// need to be bypassed. There is no bypass control for the L2 cache at the
+  /// isa level.
+
+  if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Set SC bits to indicate system scope.
+      Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+      break;
+    case SIAtomicScope::AGENT:
+      // Set SC bits to indicate agent scope.
+      Changed |= enableCPolBits(MI, CPol::SC1);
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // Set SC bits to indicate workgroup scope.
+      Changed |= enableCPolBits(MI, CPol::SC0);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // Leave SC bits unset to indicate wavefront scope.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+
+    /// The scratch address space does not need the global memory caches
+    /// to be bypassed as all memory operations by the same thread are
+    /// sequentially consistent, and no other thread can access scratch
+    /// memory.
+
+    /// Other address spaces do not have a cache.
+  }
 
   return Changed;
 }
 
@@ -1098,10 +1036,31 @@ bool SIGfx6CacheControl::enableRMWCacheBypass(
   assert(MI->mayLoad() && MI->mayStore());
   bool Changed = false;
 
-  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
-  /// bypassed, and the GLC bit is instead used to indicate if they are
-  /// return or no-return.
-  /// Note: there is no L2 cache coherent bypass control at the ISA level.
+  /// For targets other than GFX940, do not set GLC for RMW atomic operations
+  /// as L0/L1 cache is automatically bypassed, and the GLC bit is instead
+  /// used to indicate if they are return or no-return. Note: there is no L2
+  /// cache coherent bypass control at the ISA level.
+  /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
+
+  if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Set SC1 bit to indicate system scope.
+      Changed |= enableCPolBits(MI, CPol::SC1);
+      break;
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+      // to indicate system or agent scope. The SC0 bit is used to indicate if
+      // they are return or no-return. Leave SC1 bit unset to indicate agent
+      // scope.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
 
   return Changed;
 }
 
@@ -1123,11 +1082,15 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
   bool Changed = false;
 
   if (IsVolatile) {
-    // Set L1 cache policy to be MISS_EVICT for load instructions
-    // and MISS_LRU for store instructions.
-    // Note: there is no L2 cache bypass policy at the ISA level.
-    if (Op == SIMemOp::LOAD)
-      Changed |= enableGLCBit(MI);
+    if (ST.hasGFX940Insts()) {
+      // Set SC bits to indicate system scope.
+      Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
+    } else if (Op == SIMemOp::LOAD) {
+      // Set L1 cache policy to be MISS_EVICT for load instructions
+      // and MISS_LRU for store instructions.
+      // Note: there is no L2 cache bypass policy at the ISA level.
+      Changed |= enableCPolBits(MI, CPol::GLC);
+    }
 
     // Ensure operation has completed at system scope to cause all volatile
     // operations to be visible outside the program in a global order. Do not
@@ -1142,10 +1105,13 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
   }
 
   if (IsNonTemporal) {
-    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
-    // for both loads and stores, and the L2 cache policy to STREAM.
-    Changed |= enableGLCBit(MI);
-    Changed |= enableSLCBit(MI);
+    if (ST.hasGFX940Insts()) {
+      Changed |= enableCPolBits(MI, CPol::NT);
+    } else {
+      // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
+      // for both loads and stores, and the L2 cache policy to STREAM.
+      Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
+    }
     return Changed;
   }
 
@@ -1166,6 +1132,26 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
   if (Pos == Position::AFTER)
     ++MI;
 
+  // GFX90A+
+  if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
+    // In threadgroup split mode the waves of a work-group can be executing on
+    // different CUs. Therefore need to wait for global or GDS memory
+    // operations to complete to ensure they are visible to waves in the other
+    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
+    // are on the same CU, so no need to wait for global memory as all waves
+    // in the work-group access the same L1, nor wait for GDS as accesses are
+    // ordered on a CU.
+    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
+                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
+        (Scope == SIAtomicScope::WORKGROUP)) {
+      // Same as GFX7 using agent scope.
+      Scope = SIAtomicScope::AGENT;
+    }
+    // In threadgroup split mode LDS cannot be allocated so no need to wait for
+    // LDS memory operations.
+    AddrSpace &= ~SIAtomicAddrSpace::LDS;
+  }
+
@@ ... @@ bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   if (!InsertCacheInv)
     return false;
 
   bool Changed = false;
 
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
+  const unsigned InvalidateL1 = ST.isAmdPalOS() || ST.isMesa3DOS()
+                                    ? AMDGPU::BUFFER_WBINVL1
+                                    : AMDGPU::BUFFER_WBINVL1_VOL;
+
   if (Pos == Position::AFTER)
     ++MI;
 
   if (canAffectGlobalAddrSpace(AddrSpace)) {
     switch (Scope) {
     case SIAtomicScope::SYSTEM:
+      if (ST.hasGFX940Insts()) {
+        // Ensures that following loads will not see stale remote VMEM data or
+        // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+        // and CC will never be stale due to the local memory probes.
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+            // Set SC bits to indicate system scope.
+            .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+        // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+        // hardware does not reorder memory operations by the same wave with
+        // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+        // remove any cache lines of earlier writes by the same wave and ensures
+        // later reads by the same wave will refetch the cache lines.
+        Changed = true;
+        break;
+      }
+
+      if (ST.hasGFX90AInsts()) {
+        // Ensures that following loads will not see stale remote VMEM data or
+        // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
+        // and CC will never be stale due to the local memory probes.
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
+        BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+        // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+        // hardware does not reorder memory operations by the same wave with
+        // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
+        // to remove any cache lines of earlier writes by the same wave and
+        // ensures later reads by the same wave will refetch the cache lines.
+        Changed = true;
+        break;
+      }
+      [[fallthrough]];
     case SIAtomicScope::AGENT:
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
+      if (ST.hasGFX940Insts()) {
+        // Ensures that following loads will not see stale remote data or
+        // local MTYPE NC global data. Local MTYPE RW and CC memory will never
+        // be stale due to the memory probes.
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+            // Set SC bits to indicate agent scope.
+            .addImm(AMDGPU::CPol::SC1);
+        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+        // does not reorder memory operations with respect to a preceding
+        // buffer invalidate. The invalidate is guaranteed to remove any cache
+        // lines of earlier writes and ensures later reads will refetch the
+        // cache lines.
+      } else
+        BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
       Changed = true;
       break;
     case SIAtomicScope::WORKGROUP:
+      if (ST.isTgSplitEnabled()) {
+        if (ST.hasGFX940Insts()) {
+          // In threadgroup split mode the waves of a work-group can be
+          // executing on different CUs. Therefore need to invalidate the L1
+          // which is per CU. Otherwise in non-threadgroup split mode all waves
+          // of a work-group are on the same CU, and so the L1 does not need to
+          // be invalidated.
+
+          // Ensures L1 is invalidated if in threadgroup split mode. In
+          // non-threadgroup split mode it is a NOP, but there is no point
+          // generating it in that case if we know we are not in that mode.
+          BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+              // Set SC bits to indicate work-group scope.
+              .addImm(AMDGPU::CPol::SC0);
+          // Inserting "S_WAITCNT vmcnt(0)" is not required because the
+          // hardware does not reorder memory operations with respect to a
+          // preceding buffer invalidate. The invalidate is guaranteed to
+          // remove any cache lines of earlier writes and ensures later reads
+          // will refetch the cache lines.
+          Changed = true;
+        } else if (ST.hasGFX90AInsts()) {
+          BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
+          Changed = true;
+        }
+      }
+      break;
     case SIAtomicScope::WAVEFRONT:
     case SIAtomicScope::SINGLETHREAD:
-      // No cache to invalidate.
+      // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
+      // there are no caches to invalidate. All other targets have no cache to
+      // invalidate.
       break;
     default:
       llvm_unreachable("Unsupported synchronization scope");
@@ -1310,618 +1377,57 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                        SIAtomicAddrSpace AddrSpace,
                                        bool IsCrossAddrSpaceOrdering,
                                        Position Pos) const {
-  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
-                    IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
-                    /*AtomicsOnly=*/false);
-}
-
-bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
-                                       SIAtomicScope Scope,
-                                       SIAtomicAddrSpace AddrSpace,
-                                       Position Pos) const {
-  if (!InsertCacheInv)
-    return false;
-
-  bool Changed = false;
-
-  MachineBasicBlock &MBB = *MI->getParent();
-  DebugLoc DL = MI->getDebugLoc();
-
-  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
+  if (ST.hasGFX90AInsts()) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    const DebugLoc &DL = MI->getDebugLoc();
 
-  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
-                                    ? AMDGPU::BUFFER_WBINVL1
-                                    : AMDGPU::BUFFER_WBINVL1_VOL;
-
-  if (Pos == Position::AFTER)
-    ++MI;
+    if (Pos == Position::AFTER)
+      ++MI;
 
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-    case SIAtomicScope::AGENT:
-      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
-      Changed = true;
-      break;
-    case SIAtomicScope::WORKGROUP:
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // No cache to invalidate.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
+    if (canAffectGlobalAddrSpace(AddrSpace)) {
+      switch (Scope) {
+      case SIAtomicScope::SYSTEM:
+        // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+        // hardware does not reorder memory operations by the same wave with
+        // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is
+        // guaranteed to initiate writeback of any dirty cache lines of
+        // earlier writes by the same wave. A "S_WAITCNT vmcnt(0)" is needed
+        // after to ensure the writeback has completed.
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+            // Set SC bits to indicate system scope.
+            .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+        Changed = true;
+        break;
+      case SIAtomicScope::AGENT:
+        if (ST.hasGFX940Insts()) {
+          BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+              // Set SC bits to indicate agent scope.
+              .addImm(AMDGPU::CPol::SC1);
+
+          // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+          // SIAtomicScope::AGENT, the following insertWait will generate the
+          // required "S_WAITCNT vmcnt(0)".
+          Changed = true;
+        }
+        break;
+      case SIAtomicScope::WORKGROUP:
+      case SIAtomicScope::WAVEFRONT:
+      case SIAtomicScope::SINGLETHREAD:
+        // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
+        // would writeback, and would require an otherwise unnecessary
+        // "S_WAITCNT vmcnt(0)".
+        break;
+      default:
+        llvm_unreachable("Unsupported synchronization scope");
+      }
     }
-  }
 
-  /// The scratch address space does not need the global memory cache
-  /// to be flushed as all memory operations by the same thread are
-  /// sequentially consistent, and no other thread can access scratch
-  /// memory.
-
-  /// Other address spaces do not have a cache.
-
-  if (Pos == Position::AFTER)
-    --MI;
-
-  return Changed;
-}
-
-bool SIGfx90ACacheControl::enableLoadCacheBypass(
-    const MachineBasicBlock::iterator &MI,
-    SIAtomicScope Scope,
-    SIAtomicAddrSpace AddrSpace) const {
-  assert(MI->mayLoad() && !MI->mayStore());
-  bool Changed = false;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-    case SIAtomicScope::AGENT:
-      // Set the L1 cache policy to MISS_LRU.
-      // Note: there is no L2 cache bypass policy at the ISA level.
-      Changed |= enableGLCBit(MI);
-      break;
-    case SIAtomicScope::WORKGROUP:
-      // In threadgroup split mode the waves of a work-group can be executing on
-      // different CUs. Therefore need to bypass the L1 which is per CU.
-      // Otherwise in non-threadgroup split mode all waves of a work-group are
-      // on the same CU, and so the L1 does not need to be bypassed.
-      if (ST.isTgSplitEnabled())
-        Changed |= enableGLCBit(MI);
-      break;
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // No cache to bypass.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  /// The scratch address space does not need the global memory caches
-  /// to be bypassed as all memory operations by the same thread are
-  /// sequentially consistent, and no other thread can access scratch
-  /// memory.
-
-  /// Other address spaces do not have a cache.
-
-  return Changed;
-}
-
-bool SIGfx90ACacheControl::enableRMWCacheBypass(
-    const MachineBasicBlock::iterator &MI,
-    SIAtomicScope Scope,
-    SIAtomicAddrSpace AddrSpace) const {
-  assert(MI->mayLoad() && MI->mayStore());
-  bool Changed = false;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-    case SIAtomicScope::AGENT:
-      /// Do not set glc for RMW atomic operations as they implicitly bypass
-      /// the L1 cache, and the glc bit is instead used to indicate if they are
-      /// return or no-return.
-      break;
-    case SIAtomicScope::WORKGROUP:
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  return Changed;
-}
-
-bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
-    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
-  // Only handle load and store, not atomic read-modify-write insructions. The
-  // latter use glc to indicate if the atomic returns a result and so must not
-  // be used for cache control.
-  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
-  // Only update load and store, not LLVM IR atomic read-modify-write
-  // instructions. The latter are always marked as volatile so cannot sensibly
-  // handle it as do not want to pessimize all atomics. Also they do not support
-  // the nontemporal attribute.
-  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
-  bool Changed = false;
-
-  if (IsVolatile) {
-    // Set L1 cache policy to be MISS_EVICT for load instructions
-    // and MISS_LRU for store instructions.
-    // Note: there is no L2 cache bypass policy at the ISA level.
-    if (Op == SIMemOp::LOAD)
-      Changed |= enableGLCBit(MI);
-
-    // Ensure operation has completed at system scope to cause all volatile
-    // operations to be visible outside the program in a global order. Do not
-    // request cross address space as only the global address space can be
-    // observable outside the program, so no need to cause a waitcnt for LDS
-    // address space operations.
-    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
-                          Position::AFTER, AtomicOrdering::Unordered,
-                          /*AtomicsOnly=*/false);
-
-    return Changed;
-  }
-
-  if (IsNonTemporal) {
-    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
-    // for both loads and stores, and the L2 cache policy to STREAM.
-    Changed |= enableGLCBit(MI);
-    Changed |= enableSLCBit(MI);
-    return Changed;
-  }
-
-  return Changed;
-}
-
-bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
-                                      SIAtomicScope Scope,
-                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsCrossAddrSpaceOrdering,
-                                      Position Pos, AtomicOrdering Order,
-                                      bool AtomicsOnly) const {
-  if (ST.isTgSplitEnabled()) {
-    // In threadgroup split mode the waves of a work-group can be executing on
-    // different CUs. Therefore need to wait for global or GDS memory operations
-    // to complete to ensure they are visible to waves in the other CUs.
-    // Otherwise in non-threadgroup split mode all waves of a work-group are on
-    // the same CU, so no need to wait for global memory as all waves in the
-    // work-group access the same the L1, nor wait for GDS as access are ordered
-    // on a CU.
-    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
-                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
-        (Scope == SIAtomicScope::WORKGROUP)) {
-      // Same as GFX7 using agent scope.
-      Scope = SIAtomicScope::AGENT;
-    }
-    // In threadgroup split mode LDS cannot be allocated so no need to wait for
-    // LDS memory operations.
-    AddrSpace &= ~SIAtomicAddrSpace::LDS;
-  }
-  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
-                                        IsCrossAddrSpaceOrdering, Pos, Order,
-                                        AtomicsOnly);
-}
-
-bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
-                                         SIAtomicScope Scope,
-                                         SIAtomicAddrSpace AddrSpace,
-                                         Position Pos) const {
-  if (!InsertCacheInv)
-    return false;
-
-  bool Changed = false;
-
-  MachineBasicBlock &MBB = *MI->getParent();
-  DebugLoc DL = MI->getDebugLoc();
-
-  if (Pos == Position::AFTER)
-    ++MI;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-      // Ensures that following loads will not see stale remote VMEM data or
-      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
-      // CC will never be stale due to the local memory probes.
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
-      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
-      // hardware does not reorder memory operations by the same wave with
-      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
-      // remove any cache lines of earlier writes by the same wave and ensures
-      // later reads by the same wave will refetch the cache lines.
-      Changed = true;
-      break;
-    case SIAtomicScope::AGENT:
-      // Same as GFX7.
-      break;
-    case SIAtomicScope::WORKGROUP:
-      // In threadgroup split mode the waves of a work-group can be executing on
-      // different CUs. Therefore need to invalidate the L1 which is per CU.
-      // Otherwise in non-threadgroup split mode all waves of a work-group are
-      // on the same CU, and so the L1 does not need to be invalidated.
-      if (ST.isTgSplitEnabled()) {
-        // Same as GFX7 using agent scope.
-        Scope = SIAtomicScope::AGENT;
-      }
-      break;
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // Same as GFX7.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  /// The scratch address space does not need the global memory cache
-  /// to be flushed as all memory operations by the same thread are
-  /// sequentially consistent, and no other thread can access scratch
-  /// memory.
-
-  /// Other address spaces do not have a cache.
-
-  if (Pos == Position::AFTER)
-    --MI;
-
-  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
-
-  return Changed;
-}
-
-bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
-                                         SIAtomicScope Scope,
-                                         SIAtomicAddrSpace AddrSpace,
-                                         bool IsCrossAddrSpaceOrdering,
-                                         Position Pos) const {
-  bool Changed = false;
-
-  MachineBasicBlock &MBB = *MI->getParent();
-  const DebugLoc &DL = MI->getDebugLoc();
-
-  if (Pos == Position::AFTER)
-    ++MI;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
-      // hardware does not reorder memory operations by the same wave with
-      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
-      // to initiate writeback of any dirty cache lines of earlier writes by the
-      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
-      // writeback has completed.
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
-          // Set SC bits to indicate system scope.
-          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
-      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
-      // vmcnt(0)" needed by the "BUFFER_WBL2".
-      Changed = true;
-      break;
-    case SIAtomicScope::AGENT:
-    case SIAtomicScope::WORKGROUP:
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // Same as GFX7.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  if (Pos == Position::AFTER)
-    --MI;
-
-  Changed |=
-      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
-                                        IsCrossAddrSpaceOrdering, Pos);
-
-  return Changed;
-}
-
-bool SIGfx940CacheControl::enableLoadCacheBypass(
-    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
-    SIAtomicAddrSpace AddrSpace) const {
-  assert(MI->mayLoad() && !MI->mayStore());
-  bool Changed = false;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-      // Set SC bits to indicate system scope.
-      Changed |= enableSC0Bit(MI);
-      Changed |= enableSC1Bit(MI);
-      break;
-    case SIAtomicScope::AGENT:
-      // Set SC bits to indicate agent scope.
-      Changed |= enableSC1Bit(MI);
-      break;
-    case SIAtomicScope::WORKGROUP:
-      // In threadgroup split mode the waves of a work-group can be executing on
-      // different CUs. Therefore need to bypass the L1 which is per CU.
-      // Otherwise in non-threadgroup split mode all waves of a work-group are
-      // on the same CU, and so the L1 does not need to be bypassed. Setting SC
-      // bits to indicate work-group scope will do this automatically.
-      Changed |= enableSC0Bit(MI);
-      break;
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // Leave SC bits unset to indicate wavefront scope.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  /// The scratch address space does not need the global memory caches
-  /// to be bypassed as all memory operations by the same thread are
-  /// sequentially consistent, and no other thread can access scratch
-  /// memory.
-
-  /// Other address spaces do not have a cache.
-
-  return Changed;
-}
-
-bool SIGfx940CacheControl::enableStoreCacheBypass(
-    const MachineBasicBlock::iterator &MI,
-    SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
-  assert(!MI->mayLoad() && MI->mayStore());
-  bool Changed = false;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-      // Set SC bits to indicate system scope.
-      Changed |= enableSC0Bit(MI);
-      Changed |= enableSC1Bit(MI);
-      break;
-    case SIAtomicScope::AGENT:
-      // Set SC bits to indicate agent scope.
-      Changed |= enableSC1Bit(MI);
-      break;
-    case SIAtomicScope::WORKGROUP:
-      // Set SC bits to indicate workgroup scope.
-      Changed |= enableSC0Bit(MI);
-      break;
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // Leave SC bits unset to indicate wavefront scope.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  /// The scratch address space does not need the global memory caches
-  /// to be bypassed as all memory operations by the same thread are
-  /// sequentially consistent, and no other thread can access scratch
-  /// memory.
-
-  /// Other address spaces do not have a cache.
-
-  return Changed;
-}
-
-bool SIGfx940CacheControl::enableRMWCacheBypass(
-    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
-    SIAtomicAddrSpace AddrSpace) const {
-  assert(MI->mayLoad() && MI->mayStore());
-  bool Changed = false;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-      // Set SC1 bit to indicate system scope.
-      Changed |= enableSC1Bit(MI);
-      break;
-    case SIAtomicScope::AGENT:
-    case SIAtomicScope::WORKGROUP:
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
-      // to indicate system or agent scope. The SC0 bit is used to indicate if
-      // they are return or no-return. Leave SC1 bit unset to indicate agent
-      // scope.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  return Changed;
-}
-
-bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
-    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
-  // Only handle load and store, not atomic read-modify-write insructions. The
-  // latter use glc to indicate if the atomic returns a result and so must not
-  // be used for cache control.
-  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
-  // Only update load and store, not LLVM IR atomic read-modify-write
-  // instructions. The latter are always marked as volatile so cannot sensibly
-  // handle it as do not want to pessimize all atomics. Also they do not support
-  // the nontemporal attribute.
-  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
-  bool Changed = false;
-
-  if (IsVolatile) {
-    // Set SC bits to indicate system scope.
-    Changed |= enableSC0Bit(MI);
-    Changed |= enableSC1Bit(MI);
-
-    // Ensure operation has completed at system scope to cause all volatile
-    // operations to be visible outside the program in a global order. Do not
-    // request cross address space as only the global address space can be
-    // observable outside the program, so no need to cause a waitcnt for LDS
-    // address space operations.
-    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
-                          Position::AFTER, AtomicOrdering::Unordered,
-                          /*AtomicsOnly=*/false);
-
-    return Changed;
-  }
-
-  if (IsNonTemporal) {
-    Changed |= enableNTBit(MI);
-    return Changed;
-  }
-
-  return Changed;
-}
-
-bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
-                                         SIAtomicScope Scope,
-                                         SIAtomicAddrSpace AddrSpace,
-                                         Position Pos) const {
-  if (!InsertCacheInv)
-    return false;
-
-  bool Changed = false;
-
-  MachineBasicBlock &MBB = *MI->getParent();
-  DebugLoc DL = MI->getDebugLoc();
-
-  if (Pos == Position::AFTER)
-    ++MI;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-      // Ensures that following loads will not see stale remote VMEM data or
-      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
-      // CC will never be stale due to the local memory probes.
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
-          // Set SC bits to indicate system scope.
-          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
-      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
-      // hardware does not reorder memory operations by the same wave with
-      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
-      // remove any cache lines of earlier writes by the same wave and ensures
-      // later reads by the same wave will refetch the cache lines.
-      Changed = true;
-      break;
-    case SIAtomicScope::AGENT:
-      // Ensures that following loads will not see stale remote date or local
-      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
-      // due to the memory probes.
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
-          // Set SC bits to indicate agent scope.
-          .addImm(AMDGPU::CPol::SC1);
-      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
-      // does not reorder memory operations with respect to preceeding buffer
-      // invalidate. The invalidate is guaranteed to remove any cache lines of
-      // earlier writes and ensures later writes will refetch the cache lines.
-      Changed = true;
-      break;
-    case SIAtomicScope::WORKGROUP:
-      // In threadgroup split mode the waves of a work-group can be executing on
-      // different CUs. Therefore need to invalidate the L1 which is per CU.
-      // Otherwise in non-threadgroup split mode all waves of a work-group are
-      // on the same CU, and so the L1 does not need to be invalidated.
-      if (ST.isTgSplitEnabled()) {
-        // Ensures L1 is invalidated if in threadgroup split mode. In
-        // non-threadgroup split mode it is a NOP, but no point generating it in
-        // that case if know not in that mode.
-        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
-            // Set SC bits to indicate work-group scope.
-            .addImm(AMDGPU::CPol::SC0);
-        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
-        // does not reorder memory operations with respect to preceeding buffer
-        // invalidate. The invalidate is guaranteed to remove any cache lines of
-        // earlier writes and ensures later writes will refetch the cache lines.
-        Changed = true;
-      }
-      break;
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // Could generate "BUFFER_INV" but it would do nothing as there are no
-      // caches to invalidate.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  /// The scratch address space does not need the global memory cache
-  /// to be flushed as all memory operations by the same thread are
-  /// sequentially consistent, and no other thread can access scratch
-  /// memory.
-
-  /// Other address spaces do not have a cache.
-
-  if (Pos == Position::AFTER)
-    --MI;
-
-  return Changed;
-}
-
-bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
-                                         SIAtomicScope Scope,
-                                         SIAtomicAddrSpace AddrSpace,
-                                         bool IsCrossAddrSpaceOrdering,
-                                         Position Pos) const {
-  bool Changed = false;
-
-  MachineBasicBlock &MBB = *MI->getParent();
-  DebugLoc DL = MI->getDebugLoc();
-
-  if (Pos == Position::AFTER)
-    ++MI;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
-      // hardware does not reorder memory operations by the same wave with
-      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
-      // to initiate writeback of any dirty cache lines of earlier writes by the
-      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
-      // writeback has completed.
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
-          // Set SC bits to indicate system scope.
-          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
-      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
-      // SIAtomicScope::SYSTEM, the following insertWait will generate the
-      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
-      Changed = true;
-      break;
-    case SIAtomicScope::AGENT:
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
-          // Set SC bits to indicate agent scope.
-          .addImm(AMDGPU::CPol::SC1);
-
-      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
-      // SIAtomicScope::AGENT, the following insertWait will generate the
-      // required "S_WAITCNT vmcnt(0)".
-      Changed = true;
-      break;
-    case SIAtomicScope::WORKGROUP:
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // Do not generate "BUFFER_WBL2" as there are no caches it would
-      // writeback, and would require an otherwise unnecessary
-      // "S_WAITCNT vmcnt(0)".
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
+    if (Pos == Position::AFTER)
+      --MI;
   }
 
-  if (Pos == Position::AFTER)
-    --MI;
-
   // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
   // S_WAITCNT needed.
   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
@@ -1944,8 +1450,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
     case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
-      Changed |= enableGLCBit(MI);
-      Changed |= enableDLCBit(MI);
+      Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
-        Changed |= enableGLCBit(MI);
+        Changed |= enableCPolBits(MI, CPol::GLC);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
@@ -1996,8 +1501,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD) {
-      Changed |= enableGLCBit(MI);
-      Changed |= enableDLCBit(MI);
+      Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
    }
 
    // Ensure operation has completed at system scope to cause all volatile
@@ -2017,8 +1521,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
-      Changed |= enableGLCBit(MI);
-    Changed |= enableSLCBit(MI);
+      Changed |= enableCPolBits(MI, CPol::GLC);
+    Changed |= enableCPolBits(MI, CPol::SLC);
 
    return Changed;
  }
@@ -2230,7 +1734,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass(
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
-      Changed |= enableGLCBit(MI);
+      Changed |= enableCPolBits(MI, CPol::GLC);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
-        Changed |= enableGLCBit(MI);
+        Changed |= enableCPolBits(MI, CPol::GLC);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
@@ -2281,10 +1785,10 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache coherent bypass control at the ISA level.
    if (Op == SIMemOp::LOAD)
-      Changed |= enableGLCBit(MI);
+      Changed |= enableCPolBits(MI, CPol::GLC);
 
    // Set MALL NOALLOC for load and store instructions.
-    Changed |= enableDLCBit(MI);
+    Changed |= enableCPolBits(MI, CPol::DLC);
 
    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
@@ -2303,11 +1807,11 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
    // For stores setting both GLC and SLC configures L0 and L1 cache policy
    // to MISS_EVICT and the L2 cache policy to STREAM.
    if (Op == SIMemOp::STORE)
-      Changed |= enableGLCBit(MI);
-    Changed |= enableSLCBit(MI);
+      Changed |= enableCPolBits(MI, CPol::GLC);
+    Changed |= enableCPolBits(MI, CPol::SLC);
 
    // Set MALL NOALLOC for load and store instructions.
-    Changed |= enableDLCBit(MI);
+    Changed |= enableCPolBits(MI, CPol::DLC);
 
    return Changed;
  }
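
Reviewer note, not part of the patch: the refactor rests on two mechanics visible above. First, enableCPolBits ORs any number of cache-policy bits into an instruction's cpol immediate in a single call, replacing the per-bit enableGLCBit/enableSLCBit/... helpers. Second, SICacheControl::create now returns SIGfx6CacheControl for every pre-GFX10 subtarget, with the former GFX7/GFX90A/GFX940 subclass behavior folded into runtime checks such as ST.hasGFX940Insts(). The following minimal, self-contained C++ sketch illustrates that pattern only; every name and bit value in it (Inst, Subtarget, CacheControl, the CPol constants) is an invented stand-in, not the real LLVM/AMDGPU API:

// Standalone sketch of the pattern used by this patch. All names and bit
// values below are illustrative stand-ins, not the real LLVM/AMDGPU API.
#include <cstdint>
#include <iostream>
#include <memory>
#include <optional>

namespace CPol {
// Hypothetical cache-policy bits; the real values live in AMDGPU::CPol.
enum Bits : uint64_t { GLC = 1, SLC = 2, DLC = 4, NT = 8, SC0 = 16, SC1 = 32 };
} // namespace CPol

struct Inst {
  // Models the optional cpol immediate operand of a memory instruction.
  std::optional<uint64_t> CPolImm;
};

struct Subtarget {
  bool HasGFX940Insts = false;
};

class CacheControl {
public:
  explicit CacheControl(const Subtarget &ST) : ST(ST) {}
  virtual ~CacheControl() = default;

  // Mirrors enableCPolBits: OR any number of bits in one call and report
  // whether the instruction changed (false when no cpol operand exists).
  bool enableCPolBits(Inst &MI, uint64_t Bits) const {
    if (!MI.CPolImm)
      return false;
    *MI.CPolImm |= Bits;
    return true;
  }

  // Mirrors the merged control: one class with runtime per-target checks
  // instead of a subclass per target (system-scope load bypass shown).
  bool enableLoadCacheBypassSystem(Inst &MI) const {
    return enableCPolBits(MI, ST.HasGFX940Insts ? (CPol::SC0 | CPol::SC1)
                                                : uint64_t(CPol::GLC));
  }

private:
  const Subtarget &ST;
};

// Mirrors SICacheControl::create collapsing the pre-GFX10 cases to one class.
std::unique_ptr<CacheControl> create(const Subtarget &ST) {
  return std::make_unique<CacheControl>(ST);
}

int main() {
  Subtarget ST{/*HasGFX940Insts=*/true};
  auto CC = create(ST);
  Inst MI{0};
  CC->enableLoadCacheBypassSystem(MI);
  std::cout << std::hex << *MI.CPolImm << '\n'; // 0x30 == SC0 | SC1
}

The design point this illustrates: replacing the GFX7/GFX90A/GFX940 subclass ladder with predicates keeps every scope decision for the pre-GFX10 memory model in one switch per operation, as the diff's merged SIGfx6CacheControl::enableLoadCacheBypass shows.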