From 6e27c8e3f98db5c087603e50a037403d9600a6cf Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 10 Sep 2025 14:23:37 +0200 Subject: [PATCH] [AMDGPU][SIInsertWaitcnts] Track SCC. Insert KM_CNT waits for SCC writes. Add a new event, SCC_WRITE, for s_barrier_signal_isfirst and s_barrier_leave, the instructions that write to SCC asynchronously; the event is tracked by the KM_CNT counter. Also start tracking SCC for reads and writes. An s_barrier_wait on the same barrier guarantees that the SCC write from s_barrier_signal_isfirst has landed, so no s_wait_kmcnt needs to be inserted in that case. This is only recognized for the immediate form of s_barrier_signal_isfirst; the M0 form still gets an s_wait_kmcnt. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 81 +++++++- .../llvm.amdgcn.s.barrier.signal.isfirst.ll | 75 +++++++- .../llvm.amdgcn.s.barrier.signal.isfirst.mir | 105 +++++++++++ llvm/test/CodeGen/AMDGPU/s-barrier.ll | 1 + .../waitcnt-kmcnt-scc-different-block.mir | 173 ++++++++++++++++++ 5 files changed, 428 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ae75fb529dade..fcc3ffc7c5442 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -121,6 +121,7 @@ struct HardwareLimits { DECL(LDS_ACCESS) /* lds read & write */ \ DECL(GDS_ACCESS) /* gds read & write */ \ DECL(SQ_MESSAGE) /* send message */ \ + DECL(SCC_WRITE) /* write to SCC from barrier */ \ DECL(SMEM_ACCESS) /* scalar-memory read & write */ \ DECL(SMEM_GROUP) /* scalar-memory group */ \ DECL(EXP_GPR_LOCK) /* export holding on its data src */ \ @@ -149,6 +150,7 @@ static constexpr StringLiteral WaitEventTypeName[] = { // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs +// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. 
SCC // We reserve a fixed number of VGPR slots in the scoring tables for // special tokens like SCMEM_LDS (needed for buffer load to LDS). enum RegisterMapping { @@ -163,6 +165,9 @@ enum RegisterMapping { FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores. NUM_LDS_VGPRS = 9, // One more than the stores we track. NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start. + NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS, + // Remaining non-allocatable registers + SCC = NUM_ALL_ALLOCATABLE }; // Enumerate different types of result-returning VMEM operations. Although @@ -401,7 +406,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), eventMask({VMEM_SAMPLER_READ_ACCESS}), eventMask({VMEM_BVH_READ_ACCESS}), - eventMask({SMEM_ACCESS, SQ_MESSAGE}), + eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}), eventMask({VMEM_GROUP, SMEM_GROUP})}; return WaitEventMaskForInstGFX12Plus; @@ -586,6 +591,7 @@ class SIInsertWaitcnts { WaitcntBrackets &ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); + static bool asynchronouslyWritesSCC(unsigned Opcode); }; // This objects maintains the current score brackets of each wait counter, and @@ -626,7 +632,12 @@ class WaitcntBrackets { unsigned getRegScore(int GprNo, InstCounterType T) const { if (GprNo < NUM_ALL_VGPRS) return VgprScores[T][GprNo]; - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + + if (GprNo < NUM_ALL_ALLOCATABLE) + return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + + assert(GprNo == SCC); + return SCCScore; } bool merge(const WaitcntBrackets &Other); @@ -646,6 +657,7 @@ class WaitcntBrackets { AMDGPU::Waitcnt &Wait) const { determineWait(T, {RegNo, RegNo + 1}, Wait); } + void tryClearSCCWriteEvent(MachineInstr *Inst); void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); @@ 
-785,6 +797,10 @@ class WaitcntBrackets { // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the // X_CNT score. unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + // Reg score for SCC. + unsigned SCCScore = 0; + // The unique instruction that has an SCC write pending, if there is one. + const MachineInstr *PendingSCCWrite = nullptr; // Bitmask of the VmemTypes of VMEM instructions that might have a pending // write to each vgpr. unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; @@ -820,6 +836,9 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, const MachineRegisterInfo *MRI, const SIRegisterInfo *TRI, const MachineOperand &Op) const { + if (Op.getReg() == AMDGPU::SCC) + return {SCC, SCC + 1}; + if (!TRI->isInAllocatableClass(Op.getReg())) return {-1, -1}; @@ -864,9 +883,12 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval, if (RegNo < NUM_ALL_VGPRS) { VgprUB = std::max(VgprUB, RegNo); VgprScores[CntTy][RegNo] = Score; - } else { + } else if (RegNo < NUM_ALL_ALLOCATABLE) { SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS); SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score; + } else { + assert(RegNo == SCC); + SCCScore = Score; } } } @@ -1077,6 +1099,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, if (Slot) setRegScore(FIRST_LDS_VGPR, T, CurrScore); } + + if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) { + setRegScore(SCC, T, CurrScore); + PendingSCCWrite = &Inst; + } } } @@ -1145,6 +1172,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const { OS << RelScore << ":s" << J << " "; } } + if (T == KM_CNT && SCCScore > 0) + OS << SCCScore << ":scc "; } OS << '\n'; } @@ -1219,6 +1248,24 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, } } +void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) { + // S_BARRIER_WAIT on the same barrier guarantees that the pending write to + // SCC has landed + if (PendingSCCWrite && + 
PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM && + PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) { + unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE; + // If this SCC_WRITE is the only pending KM_CNT event, clear counter. + if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) == + SCC_WRITE_PendingEvent) { + setScoreLB(KM_CNT, getScoreUB(KM_CNT)); + } + + PendingEvents &= ~SCC_WRITE_PendingEvent; + PendingSCCWrite = nullptr; + } +} + void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { applyWaitcnt(LOAD_CNT, Wait.LoadCnt); applyWaitcnt(EXP_CNT, Wait.ExpCnt); @@ -1908,6 +1955,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait); } } + } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) { + ScoreBrackets.tryClearSCCWriteEvent(&MI); } else { // FIXME: Should not be relying on memoperands. // Look at the source operands of every instruction to see if @@ -1997,6 +2046,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.determineWait(EXP_CNT, Interval, Wait); } ScoreBrackets.determineWait(DS_CNT, Interval, Wait); + } else if (Op.getReg() == AMDGPU::SCC) { + ScoreBrackets.determineWait(KM_CNT, Interval, Wait); } else { ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); } @@ -2334,6 +2385,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); else ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); + } else if (asynchronouslyWritesSCC(Inst.getOpcode())) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst); } else { switch (Inst.getOpcode()) { case AMDGPU::S_SENDMSG: @@ -2344,9 +2397,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, break; case AMDGPU::S_MEMTIME: case AMDGPU::S_MEMREALTIME: - case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0: - case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: - case 
AMDGPU::S_BARRIER_LEAVE: case AMDGPU::S_GET_BARRIER_STATE_M0: case AMDGPU::S_GET_BARRIER_STATE_IMM: ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); @@ -2413,6 +2463,19 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { if (T == DS_CNT) StrictDom |= mergeScore(M, LastGDS, Other.LastGDS); + if (T == KM_CNT) { + StrictDom |= mergeScore(M, SCCScore, Other.SCCScore); + if (Other.hasPendingEvent(SCC_WRITE)) { + unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE); + if (!OldEventsHasSCCWrite) { + PendingSCCWrite = Other.PendingSCCWrite; + } else { + if (PendingSCCWrite != Other.PendingSCCWrite) + PendingSCCWrite = nullptr; + } + } + } + for (int J = 0; J <= VgprUB; J++) StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); @@ -2444,6 +2507,12 @@ static bool isWaitInstr(MachineInstr &Inst) { counterTypeForInstr(Opcode).has_value(); } +bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) { + return Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; +} + // Generate s_waitcnt instructions where needed. 
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll index 248e0c716b975..7ff13908eda4f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll @@ -12,10 +12,10 @@ define i1 @func1() { ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0 ; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: func1: @@ -27,13 +27,86 @@ define i1 @func1() { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0 ; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) + ret i1 %r +} + +define i1 @signal_isfirst_same_barrier_wait() { +; GFX12-SDAG-LABEL: signal_isfirst_same_barrier_wait: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 +; GFX12-SDAG-NEXT: s_barrier_wait -1 +; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: signal_isfirst_same_barrier_wait: +; 
GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 +; GFX12-GISEL-NEXT: s_barrier_wait -1 ; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) + call void @llvm.amdgcn.s.barrier.wait(i16 -1) + ret i1 %r +} + +define i1 @signal_isfirst_different_barrier_wait() { +; GFX12-SDAG-LABEL: signal_isfirst_different_barrier_wait: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 +; GFX12-SDAG-NEXT: s_barrier_wait 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: signal_isfirst_different_barrier_wait: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 +; GFX12-GISEL-NEXT: s_barrier_wait 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) + 
call void @llvm.amdgcn.s.barrier.wait(i16 0) ret i1 %r } +declare void @llvm.amdgcn.s.barrier.wait(i16) declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir new file mode 100644 index 0000000000000..3972553867ab9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.mir @@ -0,0 +1,105 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: signal_isfirst_imm_same_barrier_wait +body: | + bb.0: + ; GCN-LABEL: name: signal_isfirst_imm_same_barrier_wait + ; GCN: S_WAIT_LOADCNT_DSCNT 0 + ; GCN-NEXT: S_WAIT_EXPCNT 0 + ; GCN-NEXT: S_WAIT_SAMPLECNT 0 + ; GCN-NEXT: S_WAIT_BVHCNT 0 + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc + ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + ; GCN-NEXT: S_BARRIER_WAIT -1 + ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc + ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_CMP_EQ_U32 0, 0, implicit-def $scc + S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + S_BARRIER_WAIT -1 + renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc + renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +name: signal_isfirst_imm_different_barrier_wait +body: | + bb.0: + ; GCN-LABEL: name: signal_isfirst_imm_different_barrier_wait + ; GCN: S_WAIT_LOADCNT_DSCNT 0 + ; GCN-NEXT: S_WAIT_EXPCNT 0 + ; GCN-NEXT: S_WAIT_SAMPLECNT 0 + ; GCN-NEXT: S_WAIT_BVHCNT 0 + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc + ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + ; GCN-NEXT: S_BARRIER_WAIT 0 + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc + ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_CMP_EQ_U32 0, 0, implicit-def $scc + S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + S_BARRIER_WAIT 0 + renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc + renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +name: signal_isfirst_m0_same_barrier_wait +body: | + bb.0: + ; GCN-LABEL: name: signal_isfirst_m0_same_barrier_wait + ; GCN: S_WAIT_LOADCNT_DSCNT 0 + ; GCN-NEXT: S_WAIT_EXPCNT 0 + ; GCN-NEXT: S_WAIT_SAMPLECNT 0 + ; GCN-NEXT: S_WAIT_BVHCNT 0 + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc + ; GCN-NEXT: S_BARRIER_WAIT -1 + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc + ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_CMP_EQ_U32 0, 0, implicit-def $scc + $m0 = S_MOV_B32 -1 + S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc + S_BARRIER_WAIT -1 + renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc + renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +name: signal_isfirst_m0_different_barrier_wait +body: | + bb.0: + ; GCN-LABEL: name: signal_isfirst_m0_different_barrier_wait + ; GCN: S_WAIT_LOADCNT_DSCNT 0 + ; GCN-NEXT: S_WAIT_EXPCNT 0 + ; GCN-NEXT: S_WAIT_SAMPLECNT 0 + ; GCN-NEXT: S_WAIT_BVHCNT 0 + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc + ; GCN-NEXT: $m0 = S_MOV_B32 -1 + ; GCN-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc + ; GCN-NEXT: S_BARRIER_WAIT 0 + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc + ; GCN-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_CMP_EQ_U32 0, 0, implicit-def $scc + $m0 = S_MOV_B32 -1 + S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit killed $scc + S_BARRIER_WAIT 0 + renamable $sgpr0 = S_CSELECT_B32 -1, 0, implicit killed $scc + renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll index a4fa8e4b3c8e2..8a9beb73a6baa 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -155,6 +155,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_barrier_signal -1 ; GFX12-GISEL-NEXT: s_barrier_join m0 ; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 ; GFX12-GISEL-NEXT: s_barrier_wait 1 ; GFX12-GISEL-NEXT: s_barrier_leave diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir new file mode 100644 index 0000000000000..33085bfa2cc96 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-kmcnt-scc-different-block.mir @@ -0,0 +1,173 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s + +--- +name: scc_write_in_other_block +body: | + ; GFX12-LABEL: name: scc_write_in_other_block + ; GFX12: bb.0: + ; GFX12-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.1: + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc + ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + ; 
GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc + ; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + bb.0: + S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc + S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec + + bb.2: + renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc + $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: scc_write_in_other_block_with_barrier_wait +body: | + ; GFX12-LABEL: name: scc_write_in_other_block_with_barrier_wait + ; GFX12: bb.0: + ; GFX12-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.1: + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc + ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: S_BARRIER_WAIT -1 + ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc + ; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + bb.0: + S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit killed $scc + S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + GLOBAL_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, implicit $exec + + bb.2: + S_BARRIER_WAIT -1 + renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc + $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + 
GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: scc_write_in_multiple_blocks_with_barrier_wait +body: | + ; GFX12-LABEL: name: scc_write_in_multiple_blocks_with_barrier_wait + ; GFX12: bb.0: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + ; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.1: + ; GFX12-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr1, implicit-def $exec, implicit $exec + ; GFX12-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM 0, implicit-def $scc, implicit killed $scc + ; GFX12-NEXT: S_BRANCH %bb.5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM 1, implicit-def $scc, implicit killed $scc + ; GFX12-NEXT: S_BRANCH %bb.5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: S_BARRIER_WAIT -1 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc + ; GFX12-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec + ; 
GFX12-NEXT: S_ENDPGM 0 + bb.0: + S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc + V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + S_CBRANCH_EXECZ %bb.4, implicit $exec + + bb.1: + V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr1, implicit-def $exec, implicit $exec + S_CBRANCH_EXECZ %bb.3, implicit $exec + + bb.2: + S_BARRIER_SIGNAL_ISFIRST_IMM 0, implicit-def $scc, implicit killed $scc + S_BRANCH %bb.5 + + bb.3: + S_BARRIER_SIGNAL_ISFIRST_IMM 1, implicit-def $scc, implicit killed $scc + S_BRANCH %bb.5 + + bb.4: + S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit killed $scc + + bb.5: + S_BARRIER_WAIT -1 + renamable $sgpr1 = S_CSELECT_B32 10, 20, implicit killed $scc + $vgpr5 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + GLOBAL_STORE_DWORD $vgpr6_vgpr7, $vgpr5, 0, 0, implicit $exec + S_ENDPGM 0 +...