Skip to content

Commit

Permalink
[AMDGPU] Optimize S_CBRANCH_VCC[N]Z -> S_CBRANCH_EXEC[N]Z
Browse files Browse the repository at this point in the history
Sometimes after basic block placement we end up with code like:

  sreg = s_mov_b64 -1
  vcc = s_and_b64 exec, sreg
  s_cbranch_vccz

This happens when a block that assigns -1 to a saved mask is joined with
another block that consumes that saved mask with s_and_b64 and a
branch.

Once these instructions are placed in a single basic block, the sequence
is essentially a single s_cbranch_execz instruction.

Differential Revision: https://reviews.llvm.org/D54164

llvm-svn: 346690
  • Loading branch information
rampitec committed Nov 12, 2018
1 parent 8512e59 commit e86c8d3
Show file tree
Hide file tree
Showing 4 changed files with 419 additions and 2 deletions.
97 changes: 97 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ class SIInsertSkips : public MachineFunctionPass {

bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

// Try to replace the VCC-based conditional branch MI (S_CBRANCH_VCC[N]Z)
// with a branch on EXEC when VCC is known to be "exec & -1". Returns true
// if the code was modified.
bool optimizeVccBranch(MachineInstr &MI) const;

public:
static char ID;

Expand Down Expand Up @@ -320,6 +322,96 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
return true;
}

/// Try to turn a conditional branch on VCC into a conditional branch on EXEC.
///
/// Match:
///   sreg = -1
///   vcc = S_AND_B64 exec, sreg
///   S_CBRANCH_VCC[N]Z
/// =>
///   S_CBRANCH_EXEC[N]Z
///
/// The S_AND_B64 (and the move of -1 feeding it) are removed as well when
/// nothing else depends on them.
///
/// \param MI the branch to optimize; S_CBRANCH_VCCZ is recognized explicitly
///           and any other opcode is handled as the "not zero" form.
/// \returns true if any instruction in the block was changed.
bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const unsigned CondReg = AMDGPU::VCC;
  const unsigned ExecReg = AMDGPU::EXEC;
  const unsigned And = AMDGPU::S_AND_B64;

  // Walk backwards from the branch looking for the S_AND_B64 that defines
  // VCC. Give up after a handful of instructions, or as soon as EXEC is
  // modified or VCC is written by anything other than that S_AND_B64.
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
        return false;
      break;
    }
    // Remember whether anything between the AND and the branch reads VCC;
    // if so the AND has to stay.
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  // getReg() is only valid on register operands, and either source of the
  // AND may be an immediate, so always test isReg() first.
  const auto IsExec = [ExecReg](const MachineOperand &MO) {
    return MO.isReg() && MO.getReg() == ExecReg;
  };

  // Canonicalize to "S_AND_B64 exec, <other>". The operand references stay
  // bound to the same operand slots, so they observe the swapped values.
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (!IsExec(Op1) && IsExec(Op2)) {
    // commuteInstruction returns null when it could not commute; only
    // report a change when the instruction was really rewritten.
    if (TII->commuteInstruction(*A))
      Changed = true;
  }
  if (!IsExec(Op1))
    return Changed;
  if (Op2.isImm() && Op2.getImm() != -1)
    return Changed;

  unsigned SReg = AMDGPU::NoRegister;
  if (Op2.isReg()) {
    // The mask is in a register: find its definition further up the block
    // and require it to be a move of the immediate -1.
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E ||
        !M->isMoveImmediate() ||
        !M->getOperand(1).isImm() ||
        M->getOperand(1).getImm() != -1)
      return Changed;
    // If sreg is only used by the AND instruction, fold the immediate into
    // that AND and drop the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(-1);
      M->eraseFromParent();
    }
  }

  // The AND itself is dead once the branch no longer reads VCC, provided
  // nobody else reads VCC, its SCC def is dead and the branch kills VCC.
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC itself was just set to -1, so VCC is all ones: a branch on
    // "VCC is zero" is never taken, a branch on "VCC is not zero" always is.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else {
    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
                               : AMDGPU::S_CBRANCH_EXECNZ));
  }

  // Drop the now stale VCC use and add the implicit operands required by
  // the new opcode.
  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
Expand Down Expand Up @@ -417,6 +509,11 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
}
break;

case AMDGPU::S_CBRANCH_VCCZ:
case AMDGPU::S_CBRANCH_VCCNZ:
MadeChange |= optimizeVccBranch(MI);
break;

default:
break;
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ endif:
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND
; GCN: s_cbranch_vccz [[RET]]
; GCN: s_cbranch_execz [[RET]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/infinite-loop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ loop:
; SI: s_and_b64 vcc, exec, -1
; SI: s_waitcnt lgkmcnt(0)
; SI: buffer_store_dword [[REG]]
; SI: s_cbranch_vccnz [[LOOP]]
; SI: s_cbranch_execnz [[LOOP]]

; SI: [[RET]]: ; %UnifiedReturnBlock
; SI: s_endpgm
Expand Down

0 comments on commit e86c8d3

Please sign in to comment.