Skip to content

Commit

Permalink
[AMDGPU] Propagate AGPR RC from PHI to its PHI operands
Browse files Browse the repository at this point in the history
We can fix the register class of a PHI when all of its uses are AGPR uses.
However, that leaves behind any PHIs which were already processed
earlier. Propagate the RC back to those operands of a PHI that are themselves PHIs.

Differential Revision: https://reviews.llvm.org/D77344
  • Loading branch information
rampitec committed Apr 3, 2020
1 parent b4b7c98 commit 0462795
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 0 deletions.
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Expand Up @@ -766,6 +766,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
bool AllAGPRUses = true;
SetVector<const MachineInstr *> worklist;
SmallSet<const MachineInstr *, 4> Visited;
SetVector<MachineInstr *> PHIOperands;
worklist.insert(&MI);
Visited.insert(&MI);
while (!worklist.empty()) {
Expand Down Expand Up @@ -810,6 +811,11 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
if (DefMI && DefMI->isPHI())
PHIOperands.insert(DefMI);
}
}

bool hasVGPRInput = false;
Expand Down Expand Up @@ -845,4 +851,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
TII->legalizeOperands(MI, MDT);
}

// Propagate the register class back to PHI operands which are PHIs themselves.
while (!PHIOperands.empty()) {
processPHINode(*PHIOperands.pop_back_val());
}
}
45 changes: 45 additions & 0 deletions llvm/test/CodeGen/AMDGPU/mfma-loop.ll
Expand Up @@ -487,5 +487,50 @@ exit:
ret void
}

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}

; Check that we do not copy agprs to vgprs and back in an outer loop.

; GCN: [[OUTER_LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: [[INNER_LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[INNER_LOOP]]
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[OUTER_LOOP]]

; Final result should be read only once after the loop.

; GCN-COUNT-32: v_accvgpr_read_b32

; Nested-loop MFMA test. The <32 x float> MFMA result is carried through two
; PHIs: %phi in the inner loop and %phi.0 in the outer loop. %phi.0 starts from
; zeroinitializer and is itself an incoming value of %phi, so the inner PHI has
; a PHI operand.
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
entry:
br label %for.cond.preheader

; Outer loop header: %phi.0 is zero on entry and takes the inner loop's last
; MFMA result (%mai.1) on the back edge from %inner.exit.
for.cond.preheader:
%phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
%c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
br label %inner.for.cond.preheader

; Inner loop (single block, branches back to itself): %phi takes the outer
; PHI value on entry and the previous MFMA result on the back edge.
inner.for.cond.preheader:
%phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
%c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
; The previous result %phi is fed back in as the <32 x float> operand.
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
%inc = add nuw nsw i32 %c, 1
; %c starts at 0 and the loop exits once %inc reaches 16, i.e. 16 iterations.
%cc = icmp eq i32 %inc, 16
br i1 %cc, label %inner.exit, label %inner.for.cond.preheader

; Outer loop latch: same counting scheme as the inner loop (16 iterations).
inner.exit:
%inc.0 = add nuw nsw i32 %c.0, 1
%cc.0 = icmp eq i32 %inc.0, 16
br i1 %cc.0, label %exit, label %for.cond.preheader

; The only use of the MFMA result outside the loops: store it to %arg.
exit:
store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
ret void
}

declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()

0 comments on commit 0462795

Please sign in to comment.