-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU][SIMemoryLegalizer] Combine GFX10-11 CacheControl Classes #168058
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][SIMemoryLegalizer] Combine GFX10-11 CacheControl Classes #168058
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) ChangesAlso breaks the long inheritance chains by making both With this patch, we now just have 3 Full diff: https://github.com/llvm/llvm-project/pull/168058.diff 1 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index b3c56c325e210..562d21a0cd6cf 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -404,7 +404,7 @@ class SICacheControl {
/// Generates code sequences for the memory model of all GFX targets below
/// GFX10.
-class SIGfx6CacheControl : public SICacheControl {
+class SIGfx6CacheControl final : public SICacheControl {
public:
SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
@@ -443,14 +443,27 @@ class SIGfx6CacheControl : public SICacheControl {
Position Pos) const override;
};
-class SIGfx10CacheControl : public SIGfx6CacheControl {
+/// Generates code sequences for the memory model of GFX10/11.
+class SIGfx10CacheControl final : public SICacheControl {
public:
- SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
+ SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override {
+ return false;
+ }
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override {
+ return false;
+ }
+
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal,
@@ -463,23 +476,17 @@ class SIGfx10CacheControl : public SIGfx6CacheControl {
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
-};
-
-class SIGfx11CacheControl : public SIGfx10CacheControl {
-public:
- SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
- bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const override;
-
- bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal,
- bool IsLastUse) const override;
+ bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override {
+ return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
+ }
};
-class SIGfx12CacheControl : public SIGfx11CacheControl {
+class SIGfx12CacheControl final : public SICacheControl {
protected:
// Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
// \returns Returns true if \p MI is modified, false otherwise.
@@ -504,7 +511,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
public:
- SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+ SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
// GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
// the behavior is the same if assuming GFX12.0 in CU mode.
assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
@@ -915,10 +922,8 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
if (Generation < AMDGPUSubtarget::GFX10)
return std::make_unique<SIGfx6CacheControl>(ST);
- if (Generation < AMDGPUSubtarget::GFX11)
- return std::make_unique<SIGfx10CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX12)
- return std::make_unique<SIGfx11CacheControl>(ST);
+ return std::make_unique<SIGfx10CacheControl>(ST);
return std::make_unique<SIGfx12CacheControl>(ST);
}
@@ -1438,8 +1443,7 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
}
bool SIGfx10CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
@@ -1450,7 +1454,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
case SIAtomicScope::AGENT:
// Set the L0 and L1 cache policies to MISS_EVICT.
// Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
+ // For GFX10, set GLC+DLC, for GFX11, only set GLC.
+ Changed |=
+ enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -1504,6 +1510,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
}
+ // GFX11: Set MALL NOALLOC for both load and store instructions.
+ if (AMDGPU::isGFX11(ST))
+ Changed |= enableCPolBits(MI, CPol::DLC);
+
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -1524,6 +1534,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
Changed |= enableCPolBits(MI, CPol::GLC);
Changed |= enableCPolBits(MI, CPol::SLC);
+ // GFX11: Set MALL NOALLOC for both load and store instructions.
+ if (AMDGPU::isGFX11(ST))
+ Changed |= enableCPolBits(MI, CPol::DLC);
+
return Changed;
}
@@ -1722,102 +1736,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx11CacheControl::enableLoadCacheBypass(
- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace) const {
- assert(MI->mayLoad() && !MI->mayStore());
- bool Changed = false;
-
- if (canAffectGlobalAddrSpace(AddrSpace)) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- // Set the L0 and L1 cache policies to MISS_EVICT.
- // Note: there is no L2 cache coherent bypass control at the ISA level.
- Changed |= enableCPolBits(MI, CPol::GLC);
- break;
- case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
- // CU mode all waves of a work-group are on the same CU, and so the L0
- // does not need to be bypassed.
- if (!ST.isCuModeEnabled())
- Changed |= enableCPolBits(MI, CPol::GLC);
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to bypass.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
- }
-
- /// The scratch address space does not need the global memory caches
- /// to be bypassed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not have a cache.
-
- return Changed;
-}
-
-bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
-
- // Only handle load and store, not atomic read-modify-write insructions. The
- // latter use glc to indicate if the atomic returns a result and so must not
- // be used for cache control.
- assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
- // Only update load and store, not LLVM IR atomic read-modify-write
- // instructions. The latter are always marked as volatile so cannot sensibly
- // handle it as do not want to pessimize all atomics. Also they do not support
- // the nontemporal attribute.
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
- bool Changed = false;
-
- if (IsVolatile) {
- // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
- // and MISS_LRU for store instructions.
- // Note: there is no L2 cache coherent bypass control at the ISA level.
- if (Op == SIMemOp::LOAD)
- Changed |= enableCPolBits(MI, CPol::GLC);
-
- // Set MALL NOALLOC for load and store instructions.
- Changed |= enableCPolBits(MI, CPol::DLC);
-
- // Ensure operation has completed at system scope to cause all volatile
- // operations to be visible outside the program in a global order. Do not
- // request cross address space as only the global address space can be
- // observable outside the program, so no need to cause a waitcnt for LDS
- // address space operations.
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered,
- /*AtomicsOnly=*/false);
- return Changed;
- }
-
- if (IsNonTemporal) {
- // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
- // and L2 cache policy to STREAM.
- // For stores setting both GLC and SLC configures L0 and L1 cache policy
- // to MISS_EVICT and the L2 cache policy to STREAM.
- if (Op == SIMemOp::STORE)
- Changed |= enableCPolBits(MI, CPol::GLC);
- Changed |= enableCPolBits(MI, CPol::SLC);
-
- // Set MALL NOALLOC for load and store instructions.
- Changed |= enableCPolBits(MI, CPol::DLC);
- return Changed;
- }
-
- return Changed;
-}
-
bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
|
jayfoad
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
f0a6070 to
e060c5e
Compare
6900721 to
b092e9c
Compare
+ Break the long inheritance chains by making both `SIGfx10CacheControl` and `SIGfx12CacheControl` inherit from `SICacheControl`. With this patch and the previous one, there is no more long inheritance chain in `SIMemoryLegalizer`. We just have 3 `SICacheControl` implementations that each do their own thing, and there is no more code hidden 3 superclasses above. All implementations are marked `final` too.
e060c5e to
b1e645b
Compare
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/51/builds/27211 Here is the relevant piece of the build log for the reference |

Also breaks the long inheritance chains by making both
SIGfx10CacheControlandSIGfx12CacheControlinherit fromSICacheControldirectly.With this patch, we now just have 3
SICacheControlimplementations that eachdo their own thing, and there is no more code hidden 3 superclasses above (which made this code harder to read and maintain than it needed to be).