Skip to content

Commit

Permalink
[AMDGPU] refactor WQM pass in preparation for WWM (NFCI)
Browse files Browse the repository at this point in the history
Summary:
Right now, the WQM pass conflates two different things when tracking the
Needs of an instruction:

1. Needs can be StateWQM, which is propagated to other instructions, and
means that this instruction (and everything it depends on) must be
calculated in WQM.
2. Needs can be StateExact, which is not propagated to other
instructions, and means that this instruction must not be calculated in
WQM and WQM-ness must not be propagated past this instruction.

This works now because there are only two different states, but in the
future we want to be able to express things like "calculate this in WQM,
but please disable WWM and don't propagate it" (to implement
@llvm.amdgcn.set.inactive). In order to do this, we need to split the
per-instruction Needs field in two: a new Needs field, which can only
contain StateWQM (and in the future, StateWWM) and is propagated to
sources, and a Disabled field, which can also contain just StateWQM or
nothing for now.

We keep the per-block tracking the same for now, by translating
Needs/Disabled to the old representation with only StateWQM or
StateExact. The other place that needs special handling is when we
emit the state transitions. We could just translate back to the old
representation there as well, which we almost do, but instead of 0 as a
placeholder value for "any state," we explicitly OR together all the
states an instruction is allowed to be in. This lets us refactor the
code in preparation for WWM, where we'll need to be able to handle
things like "this instruction must be in Exact or WQM, but not WWM."

Reviewers: arsenm, nhaehnle, tpr

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D35523

llvm-svn: 310086
  • Loading branch information
cwabbott0 committed Aug 4, 2017
1 parent 8c217d0 commit de068fe
Showing 1 changed file with 77 additions and 46 deletions.
123 changes: 77 additions & 46 deletions llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Expand Up @@ -54,6 +54,7 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
Expand Down Expand Up @@ -108,6 +109,7 @@ static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {

/// Per-instruction bookkeeping for the whole-quad-mode analysis. Every field
/// is a bitmask of execution-state flags and starts out empty.
struct InstrInfo {
  /// States this instruction must be computed in; these are forwarded to the
  /// instructions that define its inputs.
  char Needs{0};
  /// States this instruction must not run in; requests for these states are
  /// masked out when the instruction is marked and are not forwarded.
  char Disabled{0};
  /// NOTE(review): appears to hold the states required by code that follows
  /// this instruction -- confirm against the backward propagation logic.
  char OutNeeds{0};
};

Expand Down Expand Up @@ -142,7 +144,8 @@ class SIWholeQuadMode : public MachineFunctionPass {

void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
void markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
Expand Down Expand Up @@ -220,22 +223,27 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];

assert(Flag == StateWQM || Flag == StateExact);
assert(Flag == StateWQM);

// Ignore if the instruction is already marked. The typical case is that we
// mark an instruction WQM multiple times, but for atomics it can happen that
// Flag is StateWQM, but Needs is already set to StateExact. In this case,
// letting the atomic run in StateExact is correct as per the relevant specs.
if (II.Needs)
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by an instruction that requires WQM, where
// ignoring the request for WQM is correct as per the relevant specs.
Flag &= ~II.Disabled;

// Ignore if the flag is already encompassed by the existing needs, or we
// just disabled everything.
if ((II.Needs & Flag) == Flag)
return;

II.Needs = Flag;
II.Needs |= Flag;
Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI as WQM.
void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
std::vector<WorkItem> &Worklist) {
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
assert(Flag == StateWQM);
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
Expand All @@ -260,15 +268,15 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
if (Value->isPHIDef())
continue;

markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
Worklist);
}

continue;
}

for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
markInstruction(DefMI, StateWQM, Worklist);
markInstruction(DefMI, Flag, Worklist);
}
}

Expand All @@ -279,11 +287,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");

for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
// We need to visit the basic blocks in reverse post-order so that we visit
// defs before uses, in particular so that we don't accidentally mark an
// instruction as needing e.g. WQM before visiting it and realizing it needs
// WQM disabled.
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
MachineBasicBlock &MBB = **BI;
BlockInfo &BBI = Blocks[&MBB];

for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
MachineInstr &MI = *II;
InstrInfo &III = Instructions[&MI];
unsigned Opcode = MI.getOpcode();
char Flags = 0;

Expand All @@ -293,7 +308,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
markUsesWQM(MI, Worklist);
markInstructionUses(MI, StateWQM, Worklist);
GlobalFlags |= StateWQM;
continue;
} else if (Opcode == AMDGPU::WQM) {
Expand All @@ -302,7 +317,14 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Flags = StateWQM;
LowerToCopyInstrs.push_back(&MI);
} else if (TII->isDisableWQM(MI)) {
Flags = StateExact;
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
BBI.InNeeds |= StateExact;
Worklist.push_back(&MBB);
}
GlobalFlags |= StateExact;
III.Disabled = StateWQM;
continue;
} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);
Expand Down Expand Up @@ -344,17 +366,19 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,

// Control flow-type instructions and stores to temporary memory that are
// followed by WQM computations must themselves be in WQM.
if ((II.OutNeeds & StateWQM) && !II.Needs &&
if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
(MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
Instructions[&MI].Needs = StateWQM;
II.Needs = StateWQM;
}

// Propagate to block level
BI.Needs |= II.Needs;
if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
BI.InNeeds |= II.Needs;
Worklist.push_back(MBB);
if (II.Needs & StateWQM) {
BI.Needs |= StateWQM;
if (!(BI.InNeeds & StateWQM)) {
BI.InNeeds |= StateWQM;
Worklist.push_back(MBB);
}
}

// Propagate backwards within block
Expand All @@ -370,10 +394,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
}

// Propagate WQM flag to instruction inputs
assert(II.Needs != (StateWQM | StateExact));
assert(!(II.Needs & StateExact));

if (II.Needs == StateWQM)
markUsesWQM(MI, Worklist);
if (II.Needs != 0)
markInstructionUses(MI, II.Needs, Worklist);
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
Expand Down Expand Up @@ -594,7 +618,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
MachineBasicBlock::iterator First = IE;
for (;;) {
MachineBasicBlock::iterator Next = II;
char Needs = 0;
char Needs = StateExact | StateWQM;
char OutNeeds = 0;

if (First == IE)
Expand All @@ -606,12 +630,15 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (requiresCorrectState(MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
Needs = III->second.Needs;
if (III->second.Needs & StateWQM)
Needs = StateWQM;
else
Needs &= ~III->second.Disabled;
OutNeeds = III->second.OutNeeds;
}
}

if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateExact;

if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
Expand All @@ -624,36 +651,40 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
Needs = StateWQM;
else if (BI.OutNeeds == StateExact)
Needs = StateExact;
else
Needs = StateWQM | StateExact;
}

if (Needs) {
if (Needs != State) {
MachineBasicBlock::iterator Before =
prepareInsertion(MBB, First, II, Needs == StateWQM,
Needs == StateExact || WQMFromExec);
if (!(Needs & State)) {
MachineBasicBlock::iterator Before =
prepareInsertion(MBB, First, II, Needs == StateWQM,
Needs == StateExact || WQMFromExec);

if (Needs == StateExact) {
if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
if (Needs == StateExact) {
if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
} else {
assert(WQMFromExec == (SavedWQMReg == 0));
toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
State = StateExact;
} else {
assert(Needs == StateWQM);
assert(WQMFromExec == (SavedWQMReg == 0));

toWQM(MBB, Before, SavedWQMReg);
toWQM(MBB, Before, SavedWQMReg);

if (SavedWQMReg) {
LIS->createAndComputeVirtRegInterval(SavedWQMReg);
SavedWQMReg = 0;
}
if (SavedWQMReg) {
LIS->createAndComputeVirtRegInterval(SavedWQMReg);
SavedWQMReg = 0;
}

State = Needs;
State = StateWQM;
}

First = IE;
}

if (Needs != (StateExact | StateWQM))
First = IE;

if (II == IE)
break;
II = Next;
Expand Down

0 comments on commit de068fe

Please sign in to comment.