Skip to content

Commit

Permalink
AMDGPU/InsertWaitcnts: Simplify pending events tracking
Browse files Browse the repository at this point in the history
Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.

This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).

This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.

Reviewers: msearles, rampitec, scott.linder, kanarayan

Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam

Differential Revision: https://reviews.llvm.org/D54228

llvm-svn: 347850
  • Loading branch information
nhaehnle committed Nov 29, 2018
1 parent ae369d7 commit d1f45da
Showing 1 changed file with 59 additions and 191 deletions.
250 changes: 59 additions & 191 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Expand Up @@ -13,6 +13,14 @@
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//

Expand Down Expand Up @@ -132,10 +140,13 @@ enum WaitEventType {
NUM_WAIT_EVENTS,
};

iterator_range<enum_iterator<WaitEventType>> wait_event_types() {
return make_range(enum_iterator<WaitEventType>(VMEM_ACCESS),
enum_iterator<WaitEventType>(NUM_WAIT_EVENTS));
}
// For each hardware counter type, the bitmask of WaitEventTypes that are
// counted by it. Indexed by InstCounterType; the three initializers below
// correspond to VM_CNT, LGKM_CNT and EXP_CNT respectively (eventCounter()
// and its assert rely on these masks partitioning the event types).
static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
(1 << VMEM_ACCESS),
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
(1 << SQ_MESSAGE),
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
};

// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
Expand Down Expand Up @@ -232,24 +243,12 @@ class BlockWaitcntBrackets {

// Mapping from event to counter.
InstCounterType eventCounter(WaitEventType E) {
switch (E) {
case VMEM_ACCESS:
if (E == VMEM_ACCESS)
return VM_CNT;
case LDS_ACCESS:
case GDS_ACCESS:
case SQ_MESSAGE:
case SMEM_ACCESS:
if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
return LGKM_CNT;
case EXP_GPR_LOCK:
case GDS_GPR_LOCK:
case VMW_GPR_LOCK:
case EXP_POS_ACCESS:
case EXP_PARAM_ACCESS:
return EXP_CNT;
default:
llvm_unreachable("unhandled event type");
}
return NUM_INST_CNTS;
assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
return EXP_CNT;
}

void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
Expand Down Expand Up @@ -278,7 +277,8 @@ class BlockWaitcntBrackets {
void clear() {
memset(ScoreLBs, 0, sizeof(ScoreLBs));
memset(ScoreUBs, 0, sizeof(ScoreUBs));
memset(EventUBs, 0, sizeof(EventUBs));
PendingEvents = 0;
memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
for (auto T : inst_counter_types())
memset(VgprScores[T], 0, sizeof(VgprScores[T]));
memset(SgprScores, 0, sizeof(SgprScores));
Expand All @@ -296,15 +296,9 @@ class BlockWaitcntBrackets {
void setWaitAtBeginning() { WaitAtBeginning = true; }
void clearWaitAtBeginning() { WaitAtBeginning = false; }
bool getWaitAtBeginning() const { return WaitAtBeginning; }
void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
int32_t getMaxVGPR() const { return VgprUB; }
int32_t getMaxSGPR() const { return SgprUB; }

int32_t getEventUB(enum WaitEventType W) const {
assert(W < NUM_WAIT_EVENTS);
return EventUBs[W];
}

bool counterOutOfOrder(InstCounterType T) const;
bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
Expand All @@ -316,11 +310,12 @@ class BlockWaitcntBrackets {
const MachineRegisterInfo *MRI, WaitEventType E,
MachineInstr &MI);

bool hasPendingSMEM() const {
return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
// Returns true if at least one event of type \p E is still outstanding.
bool hasPendingEvent(WaitEventType E) const {
  const uint32_t EventBit = uint32_t(1) << E;
  return (PendingEvents & EventBit) != 0;
}

void mergePendingEvents(const BlockWaitcntBrackets &Other);

bool hasPendingFlat() const {
return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
Expand All @@ -343,23 +338,18 @@ class BlockWaitcntBrackets {
void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
int32_t getPostOrder() const { return PostOrder; }

bool mixedExpTypes() const { return MixedExpTypes; }
void setMixedExpTypes(bool MixedExpTypesIn) {
MixedExpTypes = MixedExpTypesIn;
}

void print(raw_ostream &);
void dump() { print(dbgs()); }

private:
const GCNSubtarget *ST = nullptr;
bool WaitAtBeginning = false;
bool RevisitLoop = false;
bool MixedExpTypes = false;
int32_t PostOrder = 0;
int32_t ScoreLBs[NUM_INST_CNTS] = {0};
int32_t ScoreUBs[NUM_INST_CNTS] = {0};
int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
uint32_t PendingEvents = 0;
bool MixedPendingEvents[NUM_INST_CNTS] = {false};
// Remember the last flat memory operation.
int32_t LastFlat[NUM_INST_CNTS] = {0};
// wait_cnt scores for every vgpr.
Expand Down Expand Up @@ -559,19 +549,17 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const MachineRegisterInfo &MRIA = *MRI;
InstCounterType T = eventCounter(E);
int32_t CurrScore = getScoreUB(T) + 1;
// EventUB and ScoreUB need to be update regardless if this event changes
// the score of a register or not.
// PendingEvents and ScoreUB need to be updated regardless of whether this
// event changes the score of a register or not.
// Examples include vm_cnt when buffer-store or lgkm_cnt when send-message.
EventUBs[E] = CurrScore;
if (!hasPendingEvent(E)) {
if (PendingEvents & WaitEventMaskForInst[T])
MixedPendingEvents[T] = true;
PendingEvents |= 1 << E;
}
setScoreUB(T, CurrScore);

if (T == EXP_CNT) {
// Check for mixed export types. If they are mixed, then a waitcnt exp(0)
// is required.
if (!MixedExpTypes) {
MixedExpTypes = counterOutOfOrder(EXP_CNT);
}

// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
Expand Down Expand Up @@ -803,9 +791,6 @@ void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(VM_CNT, Wait.VmCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);

if (Wait.ExpCnt == 0)
setMixedExpTypes(false);
}

void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
Expand All @@ -818,76 +803,28 @@ void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count));
} else {
setScoreLB(T, UB);
MixedPendingEvents[T] = false;
PendingEvents &= ~WaitEventMaskForInst[T];
}
}

// Fold the pending-event state of \p Other into this bracket, one hardware
// counter at a time.
void BlockWaitcntBrackets::mergePendingEvents(const BlockWaitcntBrackets &Other) {
  for (auto T : inst_counter_types()) {
    const uint32_t Mask = WaitEventMaskForInst[T];
    const uint32_t OurEvents = PendingEvents & Mask;
    const uint32_t TheirEvents = Other.PendingEvents & Mask;
    // After the merge, counter T may decrement out of order if either input
    // already was mixed, or if both inputs have pending events but the two
    // pending sets differ.
    const bool DisjointContributions =
        OurEvents && TheirEvents && OurEvents != TheirEvents;
    if (Other.MixedPendingEvents[T] || DisjointContributions)
      MixedPendingEvents[T] = true;
    PendingEvents |= TheirEvents;
  }
}

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
switch (T) {
case VM_CNT:
return false;
case LGKM_CNT: {
if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
// Scalar memory read always can go out of order.
return true;
}
int NumEventTypes = 0;
if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
NumEventTypes++;
}
if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
NumEventTypes++;
}
if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
NumEventTypes++;
}
if (NumEventTypes <= 1) {
return false;
}
break;
}
case EXP_CNT: {
// If there has been a mixture of export types, then a waitcnt exp(0) is
// required.
if (MixedExpTypes)
return true;
int NumEventTypes = 0;
if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
NumEventTypes++;
}
if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
NumEventTypes++;
}
if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
NumEventTypes++;
}
if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
NumEventTypes++;
}

if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
NumEventTypes++;
}

if (NumEventTypes <= 1) {
return false;
}
break;
}
default:
break;
}
return true;
// Scalar memory read always can go out of order.
if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
return true;
return MixedPendingEvents[T];
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
Expand Down Expand Up @@ -1023,14 +960,12 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
// Export and GDS are tracked individually, either may trigger a waitcnt
// for EXEC.
ScoreBrackets->determineWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK), Wait);
ScoreBrackets->determineWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS), Wait);
ScoreBrackets->determineWait(
EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS), Wait);
ScoreBrackets->determineWait(
EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK), Wait);
if (ScoreBrackets->hasPendingEvent(EXP_GPR_LOCK) ||
ScoreBrackets->hasPendingEvent(EXP_PARAM_ACCESS) ||
ScoreBrackets->hasPendingEvent(EXP_POS_ACCESS) ||
ScoreBrackets->hasPendingEvent(GDS_GPR_LOCK)) {
Wait.ExpCnt = 0;
}
}

#if 0 // TODO: the following code to handle CALL.
Expand Down Expand Up @@ -1135,7 +1070,7 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
if (ScoreBrackets->getScoreLB(LGKM_CNT) <
ScoreBrackets->getScoreUB(LGKM_CNT) &&
ScoreBrackets->hasPendingSMEM()) {
ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
Wait.LgkmCnt = 0;
}
}
Expand Down Expand Up @@ -1315,7 +1250,6 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
int32_t MaxPending[NUM_INST_CNTS] = {0};
int32_t MaxFlat[NUM_INST_CNTS] = {0};
bool MixedExpTypes = false;

// For single basic block loops, we need to retain the Block's
// score bracket to have accurate Pred info. So, make a copy of Block's
Expand Down Expand Up @@ -1351,25 +1285,6 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
MaxFlat[T] = std::max(MaxFlat[T], span);
}

MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
}

// Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
for (MachineBasicBlock *Pred : Block.predecessors()) {
BlockWaitcntBrackets *PredScoreBrackets =
BlockWaitcntBracketsMap[Pred].get();
bool Visited = BlockVisitedSet.count(Pred);
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
continue;
}

int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
PredScoreBrackets->getScoreLB(EXP_CNT);
MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
PredScoreBrackets->getScoreLB(EXP_CNT);
MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
}

#if 0
Expand All @@ -1390,8 +1305,6 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
ScoreBrackets->setLastFlat(T, MaxFlat[T]);
}

ScoreBrackets->setMixedExpTypes(MixedExpTypes);

// Set the register scoreboard.
for (MachineBasicBlock *Pred : Block.predecessors()) {
if (!BlockVisitedSet.count(Pred)) {
Expand Down Expand Up @@ -1434,52 +1347,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
}
}

// Also merge the WaitEvent information.
for (auto W : wait_event_types()) {
enum InstCounterType T = PredScoreBrackets->eventCounter(W);
int PredEventUB = PredScoreBrackets->getEventUB(W);
if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
int NewEventUB =
MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
if (NewEventUB > 0) {
ScoreBrackets->setEventUB(
W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
}
}
}
}

// Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
// sequencing predecessors, because changes to EXEC require waitcnts due to
// the delayed nature of these operations.
for (MachineBasicBlock *Pred : Block.predecessors()) {
if (!BlockVisitedSet.count(Pred)) {
continue;
}

BlockWaitcntBrackets *PredScoreBrackets =
BlockWaitcntBracketsMap[Pred].get();

int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
PredScoreBrackets->getScoreUB(EXP_CNT);
if (new_gds_ub > 0) {
ScoreBrackets->setEventUB(
GDS_GPR_LOCK,
std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
}
}
int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
PredScoreBrackets->getScoreUB(EXP_CNT);
if (new_exp_ub > 0) {
ScoreBrackets->setEventUB(
EXP_GPR_LOCK,
std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
}
}
ScoreBrackets->mergePendingEvents(*PredScoreBrackets);
}

// if a single block loop, update the score brackets. Not needed for other
Expand Down Expand Up @@ -1562,7 +1430,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
(!VCCZBugHandledSet.count(&Inst))) {
if (ScoreBrackets->getScoreLB(LGKM_CNT) <
ScoreBrackets->getScoreUB(LGKM_CNT) &&
ScoreBrackets->hasPendingSMEM()) {
ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
VCCZBugWorkAround = true;
}
Expand Down

0 comments on commit d1f45da

Please sign in to comment.