Skip to content

Commit

Permalink
[AMDGPU] Remove unnecessary v_mov from a register to itself in WQM lo…
Browse files Browse the repository at this point in the history
…wering.

Summary:
- SI Whole Quad Mode phase is replacing WQM pseudo instructions with v_mov instructions.
While this is necessary for the special handling of moving results out of WWM live ranges,
it is not necessary for WQM live ranges. The result is a v_mov from a register to itself after every
WQM operation. This change uses a COPY pseudo in these cases, which allows the register
allocator to coalesce the moves away.

Reviewers: tpr, dstuttard, foad, nhaehnle

Reviewed By: nhaehnle

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71386
  • Loading branch information
Michael Bedy committed Jan 11, 2020
1 parent 4134d70 commit 4a32cd1
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 5 deletions.
27 changes: 22 additions & 5 deletions llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Expand Up @@ -156,6 +156,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

void printInfo();
Expand Down Expand Up @@ -352,7 +353,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// inactive lanes.
markInstructionUses(MI, StateWWM, Worklist);
GlobalFlags |= StateWWM;
LowerToCopyInstrs.push_back(&MI);
LowerToMovInstrs.push_back(&MI);
continue;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
Expand Down Expand Up @@ -852,9 +853,8 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
}

void SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToCopyInstrs) {
for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
MI->RemoveOperand(i);
for (MachineInstr *MI : LowerToMovInstrs) {
assert(MI->getNumExplicitOperands() == 2);

const Register Reg = MI->getOperand(0).getReg();

Expand All @@ -872,13 +872,30 @@ void SIWholeQuadMode::lowerCopyInstrs() {
MI->setDesc(TII->get(AMDGPU::COPY));
}
}
for (MachineInstr *MI : LowerToCopyInstrs) {
if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
assert(MI->getNumExplicitOperands() == 3);
// the only reason we should be here is V_SET_INACTIVE has
// an undef input so it is being replaced by a simple copy.
// There should be a second undef source that we should remove.
assert(MI->getOperand(2).isUndef());
MI->RemoveOperand(2);
MI->untieRegOperand(1);
} else {
assert(MI->getNumExplicitOperands() == 2);
}

MI->setDesc(TII->get(AMDGPU::COPY));
}
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
Instructions.clear();
Blocks.clear();
LiveMaskQueries.clear();
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
CallingConv = MF.getFunction().getCallingConv();

ST = &MF.getSubtarget<GCNSubtarget>();
Expand All @@ -893,7 +910,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(Exec);
if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
return !LiveMaskQueries.empty();
} else {
// Store a copy of the original live mask when required
Expand Down
3 changes: 3 additions & 0 deletions llvm/test/CodeGen/AMDGPU/wqm.ll
Expand Up @@ -117,6 +117,9 @@ main_body:
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
; does not happen - the v_add should write the return reg directly.
;CHECK-NOT: v_mov_b32_e32
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
Expand Down
59 changes: 59 additions & 0 deletions llvm/test/CodeGen/AMDGPU/wqm.mir
Expand Up @@ -48,3 +48,62 @@ body: |
SI_RETURN_TO_EPILOG $vgpr0
...

---
# V_SET_INACTIVE, when its second operand is undef, is replaced by a
# COPY by si-wqm. Ensure the instruction is removed.
#CHECK-NOT: V_SET_INACTIVE
name: no_cfg
alignment: 1
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers:
- { id: 0, class: sgpr_32, preferred-register: '' }
- { id: 1, class: sgpr_32, preferred-register: '' }
- { id: 2, class: sgpr_32, preferred-register: '' }
- { id: 3, class: sgpr_32, preferred-register: '' }
- { id: 4, class: sgpr_32, preferred-register: '' }
- { id: 5, class: sgpr_128, preferred-register: '' }
- { id: 6, class: sgpr_128, preferred-register: '' }
- { id: 7, class: sreg_32, preferred-register: '' }
- { id: 8, class: vreg_64, preferred-register: '' }
- { id: 9, class: sreg_32, preferred-register: '' }
- { id: 10, class: vgpr_32, preferred-register: '' }
- { id: 11, class: vgpr_32, preferred-register: '' }
- { id: 12, class: sreg_32, preferred-register: '' }
- { id: 13, class: vgpr_32, preferred-register: '' }
- { id: 14, class: vgpr_32, preferred-register: '' }
- { id: 15, class: vgpr_32, preferred-register: '' }
- { id: 16, class: vgpr_32, preferred-register: '' }
liveins:
- { reg: '$sgpr0', virtual-reg: '%0' }
- { reg: '$sgpr1', virtual-reg: '%1' }
- { reg: '$sgpr2', virtual-reg: '%2' }
- { reg: '$sgpr3', virtual-reg: '%3' }
body: |
bb.0:
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
%3:sgpr_32 = COPY $sgpr3
%2:sgpr_32 = COPY $sgpr2
%1:sgpr_32 = COPY $sgpr1
%0:sgpr_32 = COPY $sgpr0
%6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
%5:sgpr_128 = COPY %6
%7:sreg_32 = S_MOV_B32 0
%8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, 0, 0, 0, implicit $exec
%16:vgpr_32 = COPY %8.sub1
%11:vgpr_32 = COPY %16
%10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec
%14:vgpr_32 = COPY %7
%13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec
early-clobber %15:vgpr_32 = WWM killed %13, implicit $exec
BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, implicit $exec
S_ENDPGM 0
...

0 comments on commit 4a32cd1

Please sign in to comment.