Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 180 additions & 29 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);

// Hidden debug/profiling flag. When set, instead of emitting one combined
// wait, the inserter emits a descending sequence of wait instructions (one
// per outstanding event, capped at the counter's maximum) so stall time can
// be attributed to individual in-flight operations. Expansion is skipped for
// counters that may complete out of order, where the sequence would be
// misleading. Off by default.
static cl::opt<bool> ExpandWaitcntProfiling(
    "amdgpu-expand-waitcnt-profiling",
    cl::desc("Expand s_waitcnt instructions for profiling"), cl::init(false),
    cl::Hidden);

namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
Expand Down Expand Up @@ -297,6 +302,34 @@ class WaitcntGenerator {
// optimization.
bool isOptNone() const { return OptNone; }

// Return the maximum value representable by the hardware counter \p T,
// i.e. the all-ones bit mask for that counter field.
// Pre-GFX12, LOAD_CNT is encoded as vmcnt and DS_CNT as lgkmcnt; on targets
// with extended wait counts (GFX12+) they are loadcnt and dscnt instead.
// Counters not modeled on this target yield 0.
unsigned getWaitCountMax(InstCounterType T) const {
  const bool Extended = ST->hasExtendedWaitCounts();
  switch (T) {
  case LOAD_CNT:
    return Extended ? AMDGPU::getLoadcntBitMask(IV)
                    : AMDGPU::getVmcntBitMask(IV);
  case DS_CNT:
    return Extended ? AMDGPU::getDscntBitMask(IV)
                    : AMDGPU::getLgkmcntBitMask(IV);
  case EXP_CNT:
    return AMDGPU::getExpcntBitMask(IV);
  case STORE_CNT:
    return AMDGPU::getStorecntBitMask(IV);
  case SAMPLE_CNT:
    return AMDGPU::getSamplecntBitMask(IV);
  case BVH_CNT:
    return AMDGPU::getBvhcntBitMask(IV);
  case KM_CNT:
    return AMDGPU::getKmcntBitMask(IV);
  case X_CNT:
    return AMDGPU::getXcntBitMask(IV);
  default:
    return 0;
  }
}

// Edits an existing sequence of wait count instructions according
// to an incoming Waitcnt value, which is itself updated to reflect
// any new wait count instructions which may need to be generated by
Expand All @@ -318,9 +351,11 @@ class WaitcntGenerator {

// Generates new wait count instructions according to the value of
// Wait, returning true if any new instructions were created.
// If ScoreBrackets is provided, it can be used for profiling expansion.
virtual bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) = 0;
AMDGPU::Waitcnt Wait,
WaitcntBrackets *ScoreBrackets = nullptr) = 0;

// Returns an array of bit masks which can be used to map values in
// WaitEventType to corresponding counter values in InstCounterType.
Expand Down Expand Up @@ -356,7 +391,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {

bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) override;
AMDGPU::Waitcnt Wait,
WaitcntBrackets *ScoreBrackets = nullptr) override;

const unsigned *getWaitEventMask() const override {
assert(ST);
Expand Down Expand Up @@ -393,7 +429,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {

bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) override;
AMDGPU::Waitcnt Wait,
WaitcntBrackets *ScoreBrackets = nullptr) override;

const unsigned *getWaitEventMask() const override {
assert(ST);
Expand Down Expand Up @@ -1523,38 +1560,108 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) {
AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
assert(ST);
assert(isNormalMode(MaxCounter));

bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);

// Helper to emit expanded waitcnt sequence for profiling.
// Emits waitcnts from (Outstanding-1) down to Target, or just Target if
// nothing to expand. The EmitWaitcnt callback emits a single waitcnt.
auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
auto EmitWaitcnt) {
if (Outstanding > Target) {
for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
EmitWaitcnt(i);
Modified = true;
}
} else {
EmitWaitcnt(Target);
Modified = true;
}
};

// Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
// single instruction while VScnt has its own instruction.
if (Wait.hasWaitExceptStoreCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
[[maybe_unused]] auto SWaitInst =
// If profiling expansion is enabled and we have score brackets,
// emit an expanded sequence
if (ExpandWaitcntProfiling && ScoreBrackets) {
// Check if any of the counters to be waited on are out-of-order.
// If so, fall back to normal (non-expanded) behavior since expansion
// would provide misleading profiling information.
bool AnyOutOfOrder = false;
for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
unsigned &WaitCnt = getCounterRef(Wait, CT);
if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) {
AnyOutOfOrder = true;
break;
}
}

if (AnyOutOfOrder) {
// Fall back to non-expanded wait
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
Modified = true;
Modified = true;
} else {
// All counters are in-order, safe to expand
for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
unsigned &WaitCnt = getCounterRef(Wait, CT);
if (WaitCnt == ~0u)
continue;

LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
ScoreBrackets->getScoreLB(CT),
getWaitCountMax(CT) - 1);
emitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
AMDGPU::Waitcnt W;
getCounterRef(W, CT) = Count;
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
.addImm(AMDGPU::encodeWaitcnt(IV, W));
});
}
}
} else {
// Normal behavior: emit single combined waitcnt
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
[[maybe_unused]] auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
Modified = true;

LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
}

if (Wait.hasWaitStoreCnt()) {
assert(ST->hasVscnt());

[[maybe_unused]] auto SWaitInst =
if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u &&
!ScoreBrackets->counterOutOfOrder(STORE_CNT)) {
// Only expand if counter is not out-of-order
unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
ScoreBrackets->getScoreLB(STORE_CNT),
getWaitCountMax(STORE_CNT) - 1);
emitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.StoreCnt);
Modified = true;
.addImm(Count);
});
} else {
[[maybe_unused]] auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.StoreCnt);
Modified = true;

LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
dbgs() << "New Instr: " << *SWaitInst << '\n');
}
}

return Modified;
Expand Down Expand Up @@ -1777,13 +1884,55 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
AMDGPU::Waitcnt Wait) {
AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
assert(ST);
assert(!isNormalMode(MaxCounter));

bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);

// Helper to emit expanded waitcnt sequence for profiling.
auto emitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
auto EmitWaitcnt) {
if (Outstanding > Target) {
for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
EmitWaitcnt(i);
Modified = true;
}
} else {
EmitWaitcnt(Target);
Modified = true;
}
};

// For GFX12+, we use separate wait instructions, which makes expansion
// simpler
if (ExpandWaitcntProfiling && ScoreBrackets) {
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
unsigned Count = getWait(Wait, CT);
if (Count == ~0u)
continue;

// Skip expansion for out-of-order counters - emit normal wait instead
if (ScoreBrackets->counterOutOfOrder(CT)) {
BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
.addImm(Count);
Modified = true;
continue;
}

unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
ScoreBrackets->getScoreLB(CT),
getWaitCountMax(CT) - 1);
emitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
.addImm(Val);
});
}
return Modified;
}

// Normal behavior (no expansion)
// Check for opportunities to use combined wait instructions.
if (Wait.DsCnt != ~0u) {
MachineInstr *SWaitInst = nullptr;
Expand Down Expand Up @@ -2141,9 +2290,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
Modified =
WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

// Any counts that could have been applied to any existing waitcnt
// instructions will have been done so, now deal with any remaining.
ScoreBrackets.applyWaitcnt(Wait);
AMDGPU::Waitcnt WaitForScore = Wait;

// ExpCnt can be merged into VINTERP.
if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
Expand All @@ -2161,23 +2308,27 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}

// XCnt may be already consumed by a load wait.
// When we wait for KmCnt==0, all SMEM operations (including address
// translations) are complete, so XCNT wait is redundant. When we wait for
// LoadCnt==0 and XCnt==0, the LoadCnt wait already ensures all address
// translations are complete (since XCnt follows LoadCnt for loads). When the
// current instruction is a VMEM access, translations are in-order.
if (Wait.XCnt != ~0u) {
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
if (Wait.KmCnt == 0)
Wait.XCnt = ~0u;

if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
else if (Wait.LoadCnt == 0 && Wait.XCnt == 0)
Wait.XCnt = ~0u;

// Since the translation for VMEM addresses occur in-order, we can skip the
// XCnt if the current instruction is of VMEM type and has a memory
// dependency with another VMEM instruction in flight.
if (isVmemAccess(*It))
else if (isVmemAccess(*It))
Wait.XCnt = ~0u;
}

if (WCG->createNewWaitcnt(Block, It, Wait))
if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
Modified = true;

// Any counts that could have been applied to any existing waitcnt
// instructions will have been done so, now deal with any remaining.
ScoreBrackets.applyWaitcnt(WaitForScore);

return Modified;
}

Expand Down
Loading