Skip to content

Commit

Permalink
[AMDGPU] Suppress redundant waitcnt instrs.
Browse files Browse the repository at this point in the history
1. Run the memory legalizer prior to the waitcnt pass; keep the policy that the waitcnt pass does not remove any waitcnts within the incoming IR.

2. The waitcnt pass doesn't (yet) track waitcnts that exist prior to the waitcnt pass (it just skips over them); because the waitcnt pass is ignorant of them, it may insert a redundant waitcnt. To avoid this, check the prev instr. If it and the to-be-inserted waitcnt are the same, suppress the insertion. We keep the existing waitcnt under the assumption that whomever, e.g., the memory legalizer, inserted it knows what they were doing.

3. Follow-on work: teach the waitcnt pass to record the pre-existing waitcnts for better waitcnt production.

Differential Revision: https://reviews.llvm.org/D42854

llvm-svn: 324440
  • Loading branch information
searlmc1 committed Feb 7, 2018
1 parent 5834052 commit 24c92ee
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 19 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Expand Up @@ -874,13 +874,13 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);

addPass(createSIMemoryLegalizerPass());
if (EnableSIInsertWaitcntsPass)
addPass(createSIInsertWaitcntsPass());
else
addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
addPass(createSIMemoryLegalizerPass());
addPass(createSIDebuggerInsertNopsPass());
addPass(&BranchRelaxationPassID);
}
Expand Down
56 changes: 38 additions & 18 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Expand Up @@ -361,7 +361,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
AMDGPUAS AMDGPUASI;

DenseSet<MachineBasicBlock *> BlockVisitedSet;
DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseSet<MachineInstr *> VCCZBugHandledSet;

DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
Expand Down Expand Up @@ -1114,7 +1114,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
BlockWaitcntBrackets *ScoreBracket =
BlockWaitcntBracketsMap[TBB].get();
if (!ScoreBracket) {
assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
assert(!BlockVisitedSet.count(TBB));
BlockWaitcntBracketsMap[TBB] =
llvm::make_unique<BlockWaitcntBrackets>();
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
Expand All @@ -1132,7 +1132,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
} else {
SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
MI.getDebugLoc());
CompilerGeneratedWaitcntSet.insert(SWaitInst);
TrackedWaitcntSet.insert(SWaitInst);
}

const MachineOperand &Op =
Expand Down Expand Up @@ -1267,7 +1267,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
for (MachineBasicBlock *pred : Block.predecessors()) {
BlockWaitcntBrackets *PredScoreBrackets =
BlockWaitcntBracketsMap[pred].get();
bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
bool Visited = BlockVisitedSet.count(pred);
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
continue;
}
Expand Down Expand Up @@ -1306,7 +1306,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
for (MachineBasicBlock *Pred : Block.predecessors()) {
BlockWaitcntBrackets *PredScoreBrackets =
BlockWaitcntBracketsMap[Pred].get();
bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
bool Visited = BlockVisitedSet.count(Pred);
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
continue;
}
Expand Down Expand Up @@ -1354,7 +1354,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {

// Set the register scoreboard.
for (MachineBasicBlock *Pred : Block.predecessors()) {
if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
if (!BlockVisitedSet.count(Pred)) {
continue;
}

Expand Down Expand Up @@ -1468,7 +1468,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// sequencing predecessors, because changes to EXEC require waitcnts due to
// the delayed nature of these operations.
for (MachineBasicBlock *Pred : Block.predecessors()) {
if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
if (!BlockVisitedSet.count(Pred)) {
continue;
}

Expand Down Expand Up @@ -1530,8 +1530,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
// TODO: Register the old waitcnt and optimize the following waitcnts.
// Leaving the previously existing waitcnts is conservatively correct.
if (CompilerGeneratedWaitcntSet.find(&Inst) ==
CompilerGeneratedWaitcntSet.end())
if (!TrackedWaitcntSet.count(&Inst))
++Iter;
else {
ScoreBrackets->setWaitcnt(&Inst);
Expand All @@ -1550,7 +1549,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,

bool VCCZBugWorkAround = false;
if (readsVCCZ(Inst) &&
(VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
(!VCCZBugHandledSet.count(&Inst))) {
if (ScoreBrackets->getScoreLB(LGKM_CNT) <
ScoreBrackets->getScoreUB(LGKM_CNT) &&
ScoreBrackets->hasPendingSMEM()) {
Expand All @@ -1564,11 +1563,29 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);

if (SWaitInst) {
Block.insert(Inst, SWaitInst);
if (ScoreBrackets->getWaitcnt() != SWaitInst) {
DEBUG(dbgs() << "insertWaitcntInBlock\n"
<< "Old Instr: " << Inst << '\n'
<< "New Instr: " << *SWaitInst << '\n';);
// We don't (yet) track waitcnts that existed prior to the waitcnt
// pass (we just skip over them); because the waitcnt pass is ignorant
// of them, it may insert a redundant waitcnt. To avoid this, check
// the prev instr. If it and the to-be-inserted waitcnt are the
// same, keep the prev waitcnt and skip the insertion. We assume that
// whomever. e.g., for memory model, inserted the prev waitcnt really
// wants it there.
bool insertSWaitInst = true;
if (Iter != Block.begin()) {
MachineInstr *MIPrevInst = &*std::prev(Iter);
if (MIPrevInst &&
MIPrevInst->getOpcode() == AMDGPU::S_WAITCNT &&
MIPrevInst->getOperand(0).getImm() == SWaitInst->getOperand(0).getImm()) {
insertSWaitInst = false;
}
}
if (insertSWaitInst) {
Block.insert(Inst, SWaitInst);
if (ScoreBrackets->getWaitcnt() != SWaitInst) {
DEBUG(dbgs() << "insertWaitcntInBlock\n"
<< "Old Instr: " << Inst << '\n'
<< "New Instr: " << *SWaitInst << '\n';);
}
}
}

Expand Down Expand Up @@ -1656,7 +1673,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (!SWaitInst) {
SWaitInst = Block.getParent()->CreateMachineInstr(
TII->get(AMDGPU::S_WAITCNT), DebugLoc());
CompilerGeneratedWaitcntSet.insert(SWaitInst);
TrackedWaitcntSet.insert(SWaitInst);
const MachineOperand &Op = MachineOperand::CreateImm(0);
SWaitInst->addOperand(MF, Op);
#if 0 // TODO: Format the debug output
Expand Down Expand Up @@ -1712,6 +1729,10 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
RegisterEncoding.SGPRL =
RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

TrackedWaitcntSet.clear();
BlockVisitedSet.clear();
VCCZBugHandledSet.clear();

// Walk over the blocks in reverse post-dominator order, inserting
// s_waitcnt where needed.
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
Expand All @@ -1738,8 +1759,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// at least 1 re-walk over the loop to propagate the information, even if
// no S_WAITCNT instructions were generated.
if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
(BlockWaitcntProcessedSet.find(&MBB) ==
BlockWaitcntProcessedSet.end())) {
(!BlockWaitcntProcessedSet.count(&MBB))) {
BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
DEBUG(dbgs() << "set-revisit: block"
<< ContainingLoop->getHeader()->getNumber() << '\n';);
Expand Down
24 changes: 24 additions & 0 deletions llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
@@ -0,0 +1,24 @@
# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s

# Check that the waitcnt pass does *not* insert a redundant waitcnt instr.
# In this testcase, ensure that pass does not insert redundant S_WAITCNT 127
# or S_WAITCNT 3952

...
# CHECK-LABEL: name: waitcnt-no-redundant
# CHECK: DS_READ_B64
# CHECK-NEXT: S_WAITCNT 127
# CHECK-NEXT: FLAT_ATOMIC_CMPSWAP
# CHECK-NEXT: S_WAITCNT 3952
# CHECK-NEXT: BUFFER_WBINVL1_VOL

name: waitcnt-no-redundant
body: |
bb.0:
renamable $vgpr0_vgpr1 = DS_READ_B64 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
S_WAITCNT 127
FLAT_ATOMIC_CMPSWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr3_vgpr4, 0, 0, implicit $exec, implicit $flat_scr
S_WAITCNT 3952
BUFFER_WBINVL1_VOL implicit $exec
S_ENDPGM
...

0 comments on commit 24c92ee

Please sign in to comment.