Skip to content

Commit

Permalink
AMDGPU: Track physical registers in SIWholeQuadMode
Browse files Browse the repository at this point in the history
Summary:
There are cases where uniform branch conditions are computed in VGPRs, and
we didn't correctly mark those as WQM.

The stray change in basic-branch.ll is because invoking the LiveIntervals
analysis leads to the detection of a dead register that would otherwise not
be seen at -O0.

This is a candidate for the 3.9 branch, as it fixes a possible hang.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: arsenm, llvm-commits, kzhuravl

Differential Revision: https://reviews.llvm.org/D22673

llvm-svn: 277500
  • Loading branch information
nhaehnle committed Aug 2, 2016
1 parent 91bdeb1 commit bef0e90
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 27 deletions.
79 changes: 53 additions & 26 deletions llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Expand Up @@ -94,12 +94,15 @@ class SIWholeQuadMode : public MachineFunctionPass {
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;

DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<const MachineInstr *, 2> ExecExports;
SmallVector<MachineInstr *, 1> LiveMaskQueries;

void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
Expand All @@ -126,6 +129,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
Expand All @@ -135,15 +139,35 @@ class SIWholeQuadMode : public MachineFunctionPass {

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
"SI Whole Quad Mode", false, false)
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
return new SIWholeQuadMode;
}

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];

assert(Flag == StateWQM || Flag == StateExact);

// Ignore if the instruction is already marked. The typical case is that we
// mark an instruction WQM multiple times, but for atomics it can happen that
// Flag is StateWQM, but Needs is already set to StateExact. In this case,
// letting the atomic run in StateExact is correct as per the relevant specs.
if (II.Needs)
return;

II.Needs = Flag;
Worklist.push_back(&MI);
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Expand Down Expand Up @@ -192,8 +216,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
continue;
}

Instructions[&MI].Needs = Flags;
Worklist.push_back(&MI);
markInstruction(MI, Flags, Worklist);
GlobalFlags |= Flags;
}

Expand Down Expand Up @@ -249,32 +272,35 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
if (!Use.isReg() || !Use.isUse())
continue;

// At this point, physical registers appear as inputs or outputs
// and following them makes no sense (and would in fact be incorrect
// when the same VGPR is used as both an output and an input that leads
// to a NeedsWQM instruction).
//
// Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
// have to trace this, in practice it happens for 64-bit computations like
// pointers where both dwords are followed already anyway.
if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
continue;

for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
InstrInfo &DefII = Instructions[&DefMI];
unsigned Reg = Use.getReg();

// Obviously skip if DefMI is already flagged as NeedWQM.
//
// The instruction might also be flagged as NeedExact. This happens when
// the result of an atomic is used in a WQM computation. In this case,
// the atomic must not run for helper pixels and the WQM result is
// undefined.
if (DefII.Needs != 0)
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
if (Reg == AMDGPU::EXEC)
continue;

DefII.Needs = StateWQM;
Worklist.push_back(&DefMI);
for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
LiveRange &LR = LIS->getRegUnit(*RegUnit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
if (!Value)
continue;

// Since we're in machine SSA, we do not need to track physical
// registers across basic blocks.
if (Value->isPHIDef())
continue;

markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
Worklist);
}

continue;
}

for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
markInstruction(DefMI, StateWQM, Worklist);
}
}

Expand Down Expand Up @@ -471,6 +497,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();

char GlobalFlags = analyzeFunction(MF);
if (!(GlobalFlags & StateWQM)) {
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/basic-branch.ll
Expand Up @@ -6,7 +6,6 @@
; GCN-LABEL: {{^}}test_branch:
; GCNNOOPT: v_writelane_b32
; GCNNOOPT: v_writelane_b32
; GCNNOOPT: v_writelane_b32
; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]

; GCN: ; BB#1
Expand Down
38 changes: 38 additions & 0 deletions llvm/test/CodeGen/AMDGPU/wqm.ll
Expand Up @@ -350,6 +350,44 @@ main_body:
ret float %s
}

; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], -2
; CHECK: s_branch [[LOOPHDR:BB[0-9]+_[0-9]+]]

; CHECK: [[LOOPHDR]]: ; %loop
; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]]
; CHECK: v_cmp_lt_i32_e32 vcc, 7, [[CTR]]
; CHECK: s_cbranch_vccz
; CHECK: ; %break

; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry:
call void @llvm.amdgcn.image.store.v4i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
br label %loop

loop:
%ctr.iv = phi i32 [ 0, %entry ], [ %ctr.next, %body ]
%c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
%cc = icmp sgt i32 %ctr.iv, 7
br i1 %cc, label %break, label %body

body:
%c.i = bitcast <4 x float> %c.iv to <4 x i32>
%c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%ctr.next = add i32 %ctr.iv, 2
br label %loop

break:
ret <4 x float> %c.iv
}

declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1

declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
Expand Down

0 comments on commit bef0e90

Please sign in to comment.