Skip to content

Commit

Permalink
[AMDGPU] Peephole adjacent equivalent S_SET_GPR_IDX_ON
Browse files Browse the repository at this point in the history
Differential Revision: https://reviews.llvm.org/D79907
  • Loading branch information
rampitec committed May 14, 2020
1 parent e9802aa commit 7d16a22
Show file tree
Hide file tree
Showing 3 changed files with 429 additions and 20 deletions.
101 changes: 91 additions & 10 deletions llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
Expand Up @@ -30,6 +30,7 @@ class SIPreEmitPeephole : public MachineFunctionPass {
const SIRegisterInfo *TRI = nullptr;

bool optimizeVccBranch(MachineInstr &MI) const;
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;

public:
static char ID;
Expand Down Expand Up @@ -143,25 +144,105 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
return true;
}

/// Try to delete \p MI when it merely repeats the earlier instruction
/// \p First.
///
/// The instructions strictly between \p First and \p MI are walked in forward
/// order. The removal is abandoned (nothing is erased, false is returned) if
/// any of them:
///   - is an S_SET_GPR_IDX_MODE,
///   - modifies M0,
///   - modifies the register used as src0 of \p MI (when src0 is a register),
///   - has a vector register operand, unless it is a V_MOV_B32_e32 with an
///     implicit use of M0 that appears before any S_SET_GPR_IDX_OFF.
///
/// On success \p MI is erased together with every S_SET_GPR_IDX_OFF that was
/// found in between, and true is returned.
///
/// NOTE(review): the walk assumes \p First precedes \p MI in the same basic
/// block; the function itself does not verify this.
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  // Only an exact duplicate of First is a candidate for removal.
  if (!MI.isIdenticalTo(First))
    return false;

  MachineBasicBlock &Block = *MI.getParent();
  const MachineFunction &Func = *Block.getParent();
  const MachineRegisterInfo &RegInfo = Func.getRegInfo();

  // Register supplying the index, or an invalid Register if src0 is not a
  // register operand.
  MachineOperand *IndexOp = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IndexReg;
  if (IndexOp->isReg())
    IndexReg = IndexOp->getReg();

  auto IsVectorRegOperand = [&RegInfo, this](const MachineOperand &MO) {
    return MO.isReg() && TRI->isVectorRegister(RegInfo, MO.getReg());
  };

  // S_SET_GPR_IDX_OFF instructions seen so far; erased only if the whole
  // range turns out to be safe.
  SmallVector<MachineInstr *, 4> PendingOffs;
  // Cleared once an S_SET_GPR_IDX_OFF has been passed.
  bool IndexingOn = true;

  // Walk forward over everything between First and MI.
  MachineBasicBlock::iterator Stop = MI.getIterator();
  for (MachineBasicBlock::iterator It = std::next(First.getIterator());
       It != Stop; ++It) {
    const unsigned Opc = It->getOpcode();

    if (Opc == AMDGPU::S_SET_GPR_IDX_MODE)
      return false;

    if (Opc == AMDGPU::S_SET_GPR_IDX_OFF) {
      IndexingOn = false;
      PendingOffs.push_back(&*It);
      continue;
    }

    // Neither M0 nor the index register may be rewritten in between.
    if (It->modifiesRegister(AMDGPU::M0, TRI))
      return false;
    if (IndexReg && It->modifiesRegister(IndexReg, TRI))
      return false;

    if (!llvm::any_of(It->operands(), IsVectorRegOperand))
      continue;

    // An instruction with a vector register operand is tolerated only when it
    // is a V_MOV_B32_e32 implicitly using M0 while indexing is still on.
    const bool IsAllowedMove =
        IndexingOn && Opc == AMDGPU::V_MOV_B32_e32 &&
        It->hasRegisterImplicitUseOperand(AMDGPU::M0);
    if (!IsAllowedMove)
      return false;
  }

  // Safe: drop the duplicate first, then the collected S_SET_GPR_IDX_OFFs.
  MI.eraseFromParent();
  for (MachineInstr *Off : PendingOffs)
    Off->eraseFromParent();
  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
bool Changed = false;

for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
if (MBBI == MBB.end())
MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
if (MBBE != MBB.end()) {
MachineInstr &MI = *MBBE;
switch (MI.getOpcode()) {
case AMDGPU::S_CBRANCH_VCCZ:
case AMDGPU::S_CBRANCH_VCCNZ:
Changed |= optimizeVccBranch(MI);
continue;
default:
break;
}
}

if (!ST.hasVGPRIndexMode())
continue;

MachineInstr &MI = *MBBI;
switch (MI.getOpcode()) {
case AMDGPU::S_CBRANCH_VCCZ:
case AMDGPU::S_CBRANCH_VCCNZ:
Changed |= optimizeVccBranch(MI);
break;
default:
break;
MachineInstr *SetGPRMI = nullptr;
const unsigned Threshold = 20;
unsigned Count = 0;
// Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
// second is not needed. Do expensive checks in the optimizeSetGPR()
// and limit the distance to 20 instructions for compile time purposes.
for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
MachineInstr &MI = *MBBI;
++MBBI;

if (Count == Threshold)
SetGPRMI = nullptr;
else
++Count;

if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
continue;

Count = 0;
if (!SetGPRMI) {
SetGPRMI = &MI;
continue;
}

if (optimizeSetGPR(*SetGPRMI, MI))
Changed = true;
else
SetGPRMI = &MI;
}
}

Expand Down
10 changes: 0 additions & 10 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
Expand Up @@ -428,8 +428,6 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v17, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v18, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
Expand Down Expand Up @@ -470,8 +468,6 @@ define amdgpu_ps void @dyn_extract_v8i64_v_s(<8 x i64> %vec, i32 inreg %sel) {
; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v16, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v17, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: global_store_dwordx2 v[0:1], v[16:17], off
Expand Down Expand Up @@ -1036,8 +1032,6 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v17, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v18, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
Expand Down Expand Up @@ -1164,8 +1158,6 @@ define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 %
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v17, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v18, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
Expand Down Expand Up @@ -1298,8 +1290,6 @@ define amdgpu_ps double @dyn_extract_v16f64_v_s(<16 x double> %vec, i32 inreg %s
; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v32, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v0, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: v_readfirstlane_b32 s0, v32
Expand Down

0 comments on commit 7d16a22

Please sign in to comment.