Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 75 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ struct HardwareLimits {
DECL(LDS_ACCESS) /* lds read & write */ \
DECL(GDS_ACCESS) /* gds read & write */ \
DECL(SQ_MESSAGE) /* send message */ \
DECL(SCC_WRITE) /* write to SCC from barrier */ \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

...Is this the same SCC as the condition register?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
DECL(SMEM_GROUP) /* scalar-memory group */ \
DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
Expand Down Expand Up @@ -149,6 +150,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS (== NUM_ALL_ALLOCATABLE) the single SCC slot
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
Expand All @@ -163,6 +165,9 @@ enum RegisterMapping {
FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
NUM_LDS_VGPRS = 9, // One more than the stores we track.
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
// Remaining non-allocatable registers
SCC = NUM_ALL_ALLOCATABLE
};

// Enumerate different types of result-returning VMEM operations. Although
Expand Down Expand Up @@ -401,7 +406,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
eventMask({VMEM_SAMPLER_READ_ACCESS}),
eventMask({VMEM_BVH_READ_ACCESS}),
eventMask({SMEM_ACCESS, SQ_MESSAGE}),
eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
eventMask({VMEM_GROUP, SMEM_GROUP})};

return WaitEventMaskForInstGFX12Plus;
Expand Down Expand Up @@ -586,6 +591,7 @@ class SIInsertWaitcnts {
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
static bool asynchronouslyWritesSCC(unsigned Opcode);
};

// This objects maintains the current score brackets of each wait counter, and
Expand Down Expand Up @@ -626,7 +632,12 @@ class WaitcntBrackets {
// Return the score recorded for scoreboard slot GprNo under counter T.
// Slots below NUM_ALL_VGPRS are read from VgprScores; slots below
// NUM_ALL_ALLOCATABLE are read from SgprScores after rebasing by
// NUM_ALL_VGPRS; the only slot accepted beyond that is SCC, whose score is
// the single SCCScore value (it is not indexed by T).
unsigned getRegScore(int GprNo, InstCounterType T) const {
  if (GprNo < NUM_ALL_VGPRS)
    return VgprScores[T][GprNo];

  // The SGPR lookup must be bounded: the SCC slot equals NUM_ALL_ALLOCATABLE,
  // so rebasing it by NUM_ALL_VGPRS would index one past the SgprScores row.
  if (GprNo < NUM_ALL_ALLOCATABLE)
    return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];

  assert(GprNo == SCC);
  return SCCScore;
}

bool merge(const WaitcntBrackets &Other);
Expand All @@ -646,6 +657,7 @@ class WaitcntBrackets {
AMDGPU::Waitcnt &Wait) const {
determineWait(T, {RegNo, RegNo + 1}, Wait);
}
void tryClearSCCWriteEvent(MachineInstr *Inst);

void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
Expand Down Expand Up @@ -785,6 +797,10 @@ class WaitcntBrackets {
// Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
// X_CNT score.
unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
// Reg score for SCC.
unsigned SCCScore = 0;
// The unique instruction that has an SCC write pending, if there is one.
const MachineInstr *PendingSCCWrite = nullptr;
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
Expand Down Expand Up @@ -820,6 +836,9 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI,
const MachineOperand &Op) const {
if (Op.getReg() == AMDGPU::SCC)
return {SCC, SCC + 1};

if (!TRI->isInAllocatableClass(Op.getReg()))
return {-1, -1};

Expand Down Expand Up @@ -864,9 +883,12 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
if (RegNo < NUM_ALL_VGPRS) {
VgprUB = std::max(VgprUB, RegNo);
VgprScores[CntTy][RegNo] = Score;
} else {
} else if (RegNo < NUM_ALL_ALLOCATABLE) {
SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
} else {
assert(RegNo == SCC);
SCCScore = Score;
}
}
}
Expand Down Expand Up @@ -1077,6 +1099,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
if (Slot)
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
}

if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
setRegScore(SCC, T, CurrScore);
PendingSCCWrite = &Inst;
}
}
}

Expand Down Expand Up @@ -1145,6 +1172,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
OS << RelScore << ":s" << J << " ";
}
}
if (T == KM_CNT && SCCScore > 0)
OS << SCCScore << ":scc ";
}
OS << '\n';
}
Expand Down Expand Up @@ -1219,6 +1248,24 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
}
}

// Retire the tracked pending SCC write, if any, when Inst makes it redundant.
// Inst is compared against the tracked writer through immediate operand 0;
// the writer only qualifies when it is an S_BARRIER_SIGNAL_ISFIRST_IMM.
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  // No SCC write is being tracked.
  if (!PendingSCCWrite)
    return;
  // Only the immediate form of the signal carries a comparable barrier id.
  if (PendingSCCWrite->getOpcode() != AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM)
    return;

  // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
  // SCC has landed; a wait on any other barrier proves nothing.
  const auto SignalledBarrier = PendingSCCWrite->getOperand(0).getImm();
  const auto WaitedBarrier = Inst->getOperand(0).getImm();
  if (SignalledBarrier != WaitedBarrier)
    return;

  const unsigned SCCWriteBit = 1 << SCC_WRITE;
  const auto PendingKMEvents =
      PendingEvents & Context->WaitEventMaskForInst[KM_CNT];
  // When the SCC write is the sole outstanding KM_CNT event, the whole
  // counter can be treated as drained.
  if (PendingKMEvents == SCCWriteBit)
    setScoreLB(KM_CNT, getScoreUB(KM_CNT));

  PendingEvents &= ~SCCWriteBit;
  PendingSCCWrite = nullptr;
}

void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
Expand Down Expand Up @@ -1908,6 +1955,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait);
}
}
} else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
ScoreBrackets.tryClearSCCWriteEvent(&MI);
} else {
// FIXME: Should not be relying on memoperands.
// Look at the source operands of every instruction to see if
Expand Down Expand Up @@ -1997,6 +2046,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
}
ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
} else if (Op.getReg() == AMDGPU::SCC) {
ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
} else {
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
}
Expand Down Expand Up @@ -2334,6 +2385,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
else
ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
} else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
Expand All @@ -2344,9 +2397,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
break;
case AMDGPU::S_MEMTIME:
case AMDGPU::S_MEMREALTIME:
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
case AMDGPU::S_BARRIER_LEAVE:
case AMDGPU::S_GET_BARRIER_STATE_M0:
case AMDGPU::S_GET_BARRIER_STATE_IMM:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
Expand Down Expand Up @@ -2413,6 +2463,19 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
if (T == DS_CNT)
StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);

if (T == KM_CNT) {
StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
if (Other.hasPendingEvent(SCC_WRITE)) {
unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
if (!OldEventsHasSCCWrite) {
PendingSCCWrite = Other.PendingSCCWrite;
} else {
if (PendingSCCWrite != Other.PendingSCCWrite)
PendingSCCWrite = nullptr;
}
}
}

for (int J = 0; J <= VgprUB; J++)
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);

Expand Down Expand Up @@ -2444,6 +2507,12 @@ static bool isWaitInstr(MachineInstr &Inst) {
counterTypeForInstr(Opcode).has_value();
}

// Report whether Opcode is one of the instructions this pass treats as
// writing SCC asynchronously: S_BARRIER_LEAVE and both the _IMM and _M0
// forms of S_BARRIER_SIGNAL_ISFIRST.
bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_BARRIER_LEAVE:
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
    return true;
  default:
    return false;
  }
}

// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineBasicBlock &Block,
Expand Down
75 changes: 74 additions & 1 deletion llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ define i1 @func1() {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: func1:
Expand All @@ -27,13 +27,86 @@ define i1 @func1() {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
ret i1 %r
}

; The barrier wait targets the same barrier (-1) that was signalled, so the
; expected output has no s_wait_kmcnt between s_barrier_wait and the
; s_cselect_b32 that follows it.
define i1 @signal_isfirst_same_barrier_wait() {
; GFX12-SDAG-LABEL: signal_isfirst_same_barrier_wait:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_barrier_wait -1
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: signal_isfirst_same_barrier_wait:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_barrier_wait -1
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.wait(i16 -1)
ret i1 %r
}

; The barrier wait targets barrier 0 while barrier -1 was signalled, so the
; expected output keeps an s_wait_kmcnt 0x0 between s_barrier_wait and the
; s_cselect_b32 that follows it.
define i1 @signal_isfirst_different_barrier_wait() {
; GFX12-SDAG-LABEL: signal_isfirst_different_barrier_wait:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_barrier_wait 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: signal_isfirst_different_barrier_wait:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_barrier_wait 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.wait(i16 0)
ret i1 %r
}

declare void @llvm.amdgcn.s.barrier.wait(i16)
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)
Loading