Skip to content

Commit

Permalink
[AMDGPU] WQM: Allow insertion of exact mode transition as terminator
Browse files Browse the repository at this point in the history
Allow the WQM pass to insert transitions to exact mode among block
terminators, instead of forcing them to occur before the terminators.

This should not yield any functional change, but it allows block
splitting of control flow, such as that in D145329.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D151797
  • Loading branch information
perlfu committed Jun 2, 2023
1 parent 3e83426 commit 2e87ed8
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 8 deletions.
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Expand Up @@ -1953,6 +1953,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_AND_B32));
break;

case AMDGPU::S_AND_SAVEEXEC_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
break;

case AMDGPU::S_AND_SAVEEXEC_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
break;

case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Expand Down Expand Up @@ -2730,11 +2742,13 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
case AMDGPU::S_AND_B64_term:
case AMDGPU::S_AND_SAVEEXEC_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
case AMDGPU::S_AND_B32_term:
case AMDGPU::S_AND_SAVEEXEC_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Expand Up @@ -306,6 +306,7 @@ def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;
}

let WaveSizePredicate = isWave32 in {
Expand All @@ -314,6 +315,7 @@ def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
def S_AND_SAVEEXEC_B32_term : WrapTerminatorInst<S_AND_SAVEEXEC_B32>;
}


Expand Down
28 changes: 20 additions & 8 deletions llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Expand Up @@ -158,10 +158,11 @@ class SIWholeQuadMode : public MachineFunctionPass {
MachinePostDominatorTree *PDT;

unsigned AndOpc;
unsigned AndTermOpc;
unsigned AndN2Opc;
unsigned XorOpc;
unsigned AndSaveExecOpc;
unsigned OrSaveExecOpc;
unsigned AndSaveExecTermOpc;
unsigned WQMOpc;
Register Exec;
Register LiveMaskReg;
Expand Down Expand Up @@ -1206,13 +1207,25 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
Register SaveWQM) {
bool IsTerminator = Before == MBB.end();
if (!IsTerminator) {
auto FirstTerm = MBB.getFirstTerminator();
if (FirstTerm != MBB.end()) {
SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
IsTerminator = BeforeIdx > FirstTermIdx;
}
}

MachineInstr *MI;

if (SaveWQM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
.addReg(LiveMaskReg);
} else {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
.addReg(Exec)
.addReg(LiveMaskReg);
}
Expand Down Expand Up @@ -1365,9 +1378,6 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
Needs = StateExact | StateWQM | StateStrict;
}

if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateExact;

++Next;
} else {
// End of basic block
Expand Down Expand Up @@ -1591,18 +1601,20 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {

if (ST->isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
AndTermOpc = AMDGPU::S_AND_B32_term;
AndN2Opc = AMDGPU::S_ANDN2_B32;
XorOpc = AMDGPU::S_XOR_B32;
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
WQMOpc = AMDGPU::S_WQM_B32;
Exec = AMDGPU::EXEC_LO;
} else {
AndOpc = AMDGPU::S_AND_B64;
AndTermOpc = AMDGPU::S_AND_B64_term;
AndN2Opc = AMDGPU::S_ANDN2_B64;
XorOpc = AMDGPU::S_XOR_B64;
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
WQMOpc = AMDGPU::S_WQM_B64;
Exec = AMDGPU::EXEC;
}
Expand Down

0 comments on commit 2e87ed8

Please sign in to comment.