-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU][SIInsertWaitcnts] Track SCC. Insert KM_CNT waits for SCC writes. #157843
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Petar Avramovic (petar-avramovic) Changes: Add new event SCC_WRITE for s_barrier_signal_isfirst and s_barrier_leave, instructions that write to SCC; the counter is KM_CNT. Patch is 26.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157843.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b163a274396ff..f2c12c32860a6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -121,6 +121,7 @@ struct HardwareLimits {
DECL(LDS_ACCESS) /* lds read & write */ \
DECL(GDS_ACCESS) /* gds read & write */ \
DECL(SQ_MESSAGE) /* send message */ \
+ DECL(SCC_WRITE) /* write to SCC from barrier */ \
DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
DECL(SMEM_GROUP) /* scalar-memory group */ \
DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
@@ -149,6 +150,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
+// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
@@ -163,6 +165,9 @@ enum RegisterMapping {
FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
NUM_LDS_VGPRS = 9, // One more than the stores we track.
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
+ NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
+ // Remaining non-allocatable registers
+ SCC = NUM_ALL_ALLOCATABLE
};
// Enumerate different types of result-returning VMEM operations. Although
@@ -401,7 +406,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
eventMask({VMEM_SAMPLER_READ_ACCESS}),
eventMask({VMEM_BVH_READ_ACCESS}),
- eventMask({SMEM_ACCESS, SQ_MESSAGE}),
+ eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
eventMask({VMEM_GROUP, SMEM_GROUP})};
return WaitEventMaskForInstGFX12Plus;
@@ -586,6 +591,7 @@ class SIInsertWaitcnts {
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
+ bool asynchronouslyWritesSCC(unsigned Opcode) const;
};
// This objects maintains the current score brackets of each wait counter, and
@@ -626,7 +632,12 @@ class WaitcntBrackets {
unsigned getRegScore(int GprNo, InstCounterType T) const {
if (GprNo < NUM_ALL_VGPRS)
return VgprScores[T][GprNo];
- return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+
+ if (GprNo < NUM_ALL_ALLOCATABLE)
+ return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+
+ assert(GprNo == SCC);
+ return SCCScore;
}
bool merge(const WaitcntBrackets &Other);
@@ -646,6 +657,7 @@ class WaitcntBrackets {
AMDGPU::Waitcnt &Wait) const {
determineWait(T, {RegNo, RegNo + 1}, Wait);
}
+ void tryClearSCCWriteEvent(MachineInstr *Inst);
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
@@ -785,6 +797,10 @@ class WaitcntBrackets {
// Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
// X_CNT score.
unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
+ // Reg score for SCC.
+ unsigned SCCScore = 0;
+ // The unique instruction that has an SCC write pending, if there is one.
+ const MachineInstr *PendingSCCWrite = nullptr;
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@@ -820,6 +836,9 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI,
const MachineOperand &Op) const {
+ if (Op.getReg() == AMDGPU::SCC)
+ return {SCC, SCC + 1};
+
if (!TRI->isInAllocatableClass(Op.getReg()))
return {-1, -1};
@@ -864,9 +883,11 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
if (RegNo < NUM_ALL_VGPRS) {
VgprUB = std::max(VgprUB, RegNo);
VgprScores[CntTy][RegNo] = Score;
- } else {
+ } else if (RegNo < NUM_ALL_ALLOCATABLE) {
SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
+ } else { // SCC
+ SCCScore = Score;
}
}
}
@@ -1077,6 +1098,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
if (Slot)
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
}
+
+ if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
+ setRegScore(SCC, T, CurrScore);
+ PendingSCCWrite = &Inst;
+ }
}
}
@@ -1145,6 +1171,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
OS << RelScore << ":s" << J << " ";
}
}
+ if (T == KM_CNT && SCCScore > 0)
+ OS << SCCScore << ":SCC ";
}
OS << '\n';
}
@@ -1219,6 +1247,24 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
}
}
+void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
+ // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
+ // SCC has landed
+ if (PendingSCCWrite &&
+ PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
+ PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
+ unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
+ // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
+ if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
+ SCC_WRITE_PendingEvent) {
+ setScoreLB(KM_CNT, getScoreUB(KM_CNT));
+ }
+
+ PendingEvents &= ~SCC_WRITE_PendingEvent;
+ PendingSCCWrite = nullptr;
+ }
+}
+
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
@@ -1908,6 +1954,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait);
}
}
+ } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
+ ScoreBrackets.tryClearSCCWriteEvent(&MI);
} else {
// FIXME: Should not be relying on memoperands.
// Look at the source operands of every instruction to see if
@@ -2003,6 +2051,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
}
ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
+ } else if (Op.getReg() == AMDGPU::SCC) {
+ ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
} else {
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
}
@@ -2340,6 +2390,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
else
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
+ } else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
@@ -2350,9 +2402,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
break;
case AMDGPU::S_MEMTIME:
case AMDGPU::S_MEMREALTIME:
- case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
- case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
- case AMDGPU::S_BARRIER_LEAVE:
case AMDGPU::S_GET_BARRIER_STATE_M0:
case AMDGPU::S_GET_BARRIER_STATE_IMM:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
@@ -2419,6 +2468,19 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
if (T == DS_CNT)
StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
+ if (T == KM_CNT) {
+ StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
+ if (Other.hasPendingEvent(SCC_WRITE)) {
+ unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
+ if (!OldEventsHasSCCWrite) {
+ PendingSCCWrite = Other.PendingSCCWrite;
+ } else {
+ if (PendingSCCWrite != Other.PendingSCCWrite)
+ PendingSCCWrite = nullptr;
+ }
+ }
+ }
+
for (int J = 0; J <= VgprUB; J++)
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
@@ -2450,6 +2512,12 @@ static bool isWaitInstr(MachineInstr &Inst) {
counterTypeForInstr(Opcode).has_value();
}
+bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) const {
+ return Opcode == AMDGPU::S_BARRIER_LEAVE ||
+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
+}
+
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineBasicBlock &Block,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
index 248e0c716b975..7ff13908eda4f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
@@ -12,10 +12,10 @@ define i1 @func1() {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: func1:
@@ -27,13 +27,86 @@ define i1 @func1() {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ ret i1 %r
+}
+
+define i1 @signal_isfirst_same_barrier_wait() {
+; GFX12-SDAG-LABEL: signal_isfirst_same_barrier_wait:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT: s_barrier_wait -1
+; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: signal_isfirst_same_barrier_wait:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_barrier_wait -1
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ ret i1 %r
+}
+
+define i1 @signal_isfirst_different_barrier_wait() {
+; GFX12-SDAG-LABEL: signal_isfirst_different_barrier_wait:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT: s_barrier_wait 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: signal_isfirst_different_barrier_wait:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_barrier_wait 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 0)
ret i1 %r
}
+declare void @llvm.amdgcn.s.barrier.wait(i16)
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir
new file mode 100644
index 0000000000000..3972553867ab9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir
@@ -0,0 +1,105 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: signal_isfirst_imm_same_barrier_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: signal_isfirst_imm_same_barrier_wait
+ ; GCN: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_EXPCNT 0
+ ; GCN-NEXT: S_WAIT_SAMPLECNT 0
+ ; GCN-NEXT: S_WAIT_BVHCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
+ ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ ; GCN-NEXT: S_BARRIER_WAIT -1
+ ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ S_BARRIER_WAIT -1
+ renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+name: signal_isfirst_imm_different_barrier_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: signal_isfirst_imm_different_barrier_wait
+ ; GCN: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_EXPCNT 0
+ ; GCN-NEXT: S_WAIT_SAMPLECNT 0
+ ; GCN-NEXT: S_WAIT_BVHCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
+ ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ ; GCN-NEXT: S_BARRIER_WAIT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc
+ S_BARRIER_WAIT 0
+ renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+name: signal_isfirst_m0_same_barrier_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: signal_isfirst_m0_same_barrier_wait
+ ; GCN: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_EXPCNT 0
+ ; GCN-NEXT: S_WAIT_SAMPLECNT 0
+ ; GCN-NEXT: S_WAIT_BVHCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
+ ; GCN-NEXT: $m0 = S_MOV_B32 -1
+ ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
+ ; GCN-NEXT: S_BARRIER_WAIT -1
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ $m0 = S_MOV_B32 -1
+ S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
+ S_BARRIER_WAIT -1
+ renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+name: signal_isfirst_m0_different_barrier_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: signal_isfirst_m0_different_barrier_wait
+ ; GCN: S_WAIT_LOADCNT_DSCNT 0
+ ; GCN-NEXT: S_WAIT_EXPCNT 0
+ ; GCN-NEXT: S_WAIT_SAMPLECNT 0
+ ; GCN-NEXT: S_WAIT_BVHCNT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
+ ; GCN-NEXT: $m0 = S_MOV_B32 -1
+ ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
+ ; GCN-NEXT: S_BARRIER_WAIT 0
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ $m0 = S_MOV_B32 -1
+ S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc
+ S_BARRIER_WAIT 0
+ renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
index a4fa8e4b3c8e2..8a9beb73a6baa 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -155,6 +155,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_barrier_signal -1
; GFX12-GISEL-NEXT: s_barrier_join m0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_barrier_leave
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir
new file mode 100644
index 0000000000000..33085bfa2cc96
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir
@@ -0,0 +1,173 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s
+
+---
+name: scc_write_in_other_block
+body: |
+ ; GFX12-LABEL: name: scc_write_in_other_block
+ ; GFX12: bb.0:
+ ; GFX12-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0
+ ; GFX12-NEXT: S_WAIT_EXPCNT 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
+ ; GFX12-NEXT: S_WAIT_BVHCNT 0
+ ; GFX1...
[truncated]
|
DECL(LDS_ACCESS) /* lds read & write */ \ | ||
DECL(GDS_ACCESS) /* gds read & write */ \ | ||
DECL(SQ_MESSAGE) /* send message */ \ | ||
DECL(SCC_WRITE) /* write to SCC from barrier */ \ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
...Is this the same SCC as the condition register?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes
5f13830
to
2396af4
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
} | ||
} | ||
if (T == KM_CNT && SCCScore > 0) | ||
OS << SCCScore << ":SCC "; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit:
OS << SCCScore << ":SCC "; | |
OS << SCCScore << ":scc "; |
[AMDGPU][SIInsertWaitcnts] Track SCC. Insert KM_CNT waits for SCC writes. Add new event SCC_WRITE for s_barrier_signal_isfirst and s_barrier_leave, instructions that write to SCC, counter is KM_CNT. Also start tracking SCC for reads and writes. s_barrier_wait on the same barrier guarantees that the SCC write from s_barrier_signal_isfirst has landed, no need to insert s_wait_kmcnt.
2396af4
to
6e27c8e
Compare
Merge activity
|
[AMDGPU][SIInsertWaitcnts] Track SCC. Insert KM_CNT waits for SCC writes. (llvm#157843) Add new event SCC_WRITE for s_barrier_signal_isfirst and s_barrier_leave, instructions that write to SCC, counter is KM_CNT. Also start tracking SCC for reads and writes. s_barrier_wait on the same barrier guarantees that the SCC write from s_barrier_signal_isfirst has landed, no need to insert s_wait_kmcnt.
Add new event SCC_WRITE for s_barrier_signal_isfirst and s_barrier_leave,
instructions that write to SCC, counter is KM_CNT.
Also start tracking SCC for reads and writes.
s_barrier_wait on the same barrier guarantees that the SCC write from
s_barrier_signal_isfirst has landed, no need to insert s_wait_kmcnt.