-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU/GFX12: Insert waitcnts before stores with scope_sys #82996
AMDGPU/GFX12: Insert waitcnts before stores with scope_sys #82996
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Petar Avramovic (petar-avramovic) Changes: Insert waitcnts for loads and atomics before stores with system scope. Full diff: https://github.com/llvm/llvm-project/pull/82996.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index d774826c1d08c0..a8a33a5fecb413 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -949,6 +949,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return AMDGPU::S_WAIT_BVHCNT;
case AMDGPU::S_WAIT_DSCNT_soft:
return AMDGPU::S_WAIT_DSCNT;
+ case AMDGPU::S_WAIT_KMCNT_soft:
+ return AMDGPU::S_WAIT_KMCNT;
default:
return Opcode;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f62e808b33e42b..8dba1fc8398442 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -312,6 +312,10 @@ class SICacheControl {
SIMemOp Op, bool IsVolatile,
bool IsNonTemporal) const = 0;
+ virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+ return false;
+ };
+
/// Inserts any necessary instructions at position \p Pos relative
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
/// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -603,6 +607,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const override;
+
+ bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
};
class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2381,6 +2387,34 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx12CacheControl::expandSystemScopeStore(
+ MachineBasicBlock::iterator &MI) const {
+
+ MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+ if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) {
+ // Stores with system scope (SCOPE_SYS) need to wait for:
+ // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
+ // - non-returning-atomics - wait for STORECNT==0
+ // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
+ // since it does not distinguish atomics-with-return from regular stores.
+
+ // There is no need to wait if memory is cached (mtype != UC).
+ // For example shader-visible memory is cached.
+ // TODO: implement flag for frontend to give us a hint not to insert waits.
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
+ return true;
+ }
+
+ return false;
+}
+
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
@@ -2467,6 +2501,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
Changed |= CC->enableVolatileAndOrNonTemporal(
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
MOI.isNonTemporal());
+
+ // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
+ // instruction field, do not confuse it with atomic scope.
+ Changed |= CC->expandSystemScopeStore(MI);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 0fe2845f8edc31..b5de311f8c58ce 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1601,6 +1601,7 @@ let SubtargetPredicate = isGFX12Plus in {
def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
diff --git a/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.ll
new file mode 100644
index 00000000000000..ed06b8aef6cac9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+
+define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: intrinsic_store_system_scope:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24)
+ ret void
+}
+
+declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.mir b/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.mir
new file mode 100644
index 00000000000000..463a70a555a192
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.mir
@@ -0,0 +1,22 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-memory-legalizer %s -o - | FileCheck -check-prefix=GFX12 %s
+
+---
+name: intrinsic_store_system_scope
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX12-LABEL: name: intrinsic_store_system_scope
+ ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_KMCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX12-NEXT: S_ENDPGM 0
+ BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ S_ENDPGM 0
+...
|
|
||
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is | ||
// instruction field, do not confuse it with atomic scope. | ||
Changed |= CC->expandSystemScopeStore(MI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this also needed for atomic stores? They returned early on line 2495 so they won't hit this code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As far as I know no, needed only non-atomic stores
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see why it wouldn't be needed on a store release? store release already waits, but it doesn't (explicitly) wait for all the counters we need to wait on for expandSystemScopeStore
// For example shader-visible memory is cached. | ||
// TODO: implement flag for frontend to give us a hint not to insert waits. | ||
MachineBasicBlock &MBB = *MI->getParent(); | ||
DebugLoc DL = MI->getDebugLoc(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
const ref
@@ -0,0 +1,16 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 | |||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
-global-isel=0
MachineBasicBlock::iterator &MI) const { | ||
|
||
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); | ||
if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
invert condition & early return
b41acdc
to
498816b
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
498816b
to
8a6e770
Compare
Volatile stores set scope_sys; insert waits there as well
@@ -2364,6 +2396,9 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( | |||
if (IsVolatile) { | |||
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); | |||
|
|||
if (Op == SIMemOp::STORE) | |||
Changed |= insertWaitsBeforeSystemScopeStore(MI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is a bit messy that we need this extra call to insertWaitsBeforeSystemScopeStore here, because the call to insertWait below modifies MI so it no longer refers to the store. But I guess it is OK.
// TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits | ||
// since it does not distinguish atomics-with-return from regular stores. | ||
// There is no need to wait if memory is cached (mtype != UC). | ||
// For example shader-visible memory is cached. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand the statement that "shader-visible memory is cached". Surely we are compiling a shader, so any memory the shader refers to is "shader-visible", so why do we need to worry about uncached memory?
MachineBasicBlock::iterator &MI) const { | ||
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); | ||
if (!CPol || ((CPol->getImm() & CPol::SCOPE) != CPol::SCOPE_SYS)) | ||
return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Returning early no longer makes this code any shorter or clearer.
Insert waitcnts for loads and atomics before stores with system scope. Scope is a field in the instruction encoding and corresponds to the desired coherence level in the cache hierarchy. Intrinsic stores can set the scope in the cache policy operand. If the volatile keyword is used on generic stores, the memory legalizer will set the scope to system. Generic stores, by default, get the lowest scope level. Waitcnts are not required if it is guaranteed that the memory is cached. For example, Vulkan shaders can guarantee this. TODO: implement a flag for frontends to give us a hint not to insert waits. The Vulkan flag is expected to be implemented as a vulkan:private MMRA.
8a6e770
to
d784fc6
Compare
Insert waitcnts for loads and atomics before stores with system scope.
Scope is a field in the instruction encoding and corresponds to the
desired coherence level in the cache hierarchy.
Intrinsic stores can set the scope in the cache policy operand.
If the volatile keyword is used on generic stores, the memory legalizer
will set the scope to system. Generic stores, by default, get the
lowest scope level.
Waitcnts are not required if it is guaranteed that the memory is cached.
For example, Vulkan shaders can guarantee this.
TODO: implement a flag for frontends to give us a hint not to insert
waits. The Vulkan flag is expected to be implemented as a
vulkan:private MMRA.