diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index b3a76aa4046b3..65286751c12df 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -757,6 +757,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
 
 void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   unsigned numVGPRUses = 0;
+  bool AllAGPRUses = true;
   SetVector worklist;
   SmallSet Visited;
   worklist.insert(&MI);
@@ -766,6 +767,11 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
     unsigned Reg = Instr->getOperand(0).getReg();
     for (const auto &Use : MRI->use_operands(Reg)) {
       const MachineInstr *UseMI = Use.getParent();
+      // A use keeps AllAGPRUses set only if it is a COPY whose destination is
+      // an AGPR, or if the register it reads is itself an AGPR.
+      AllAGPRUses &= (UseMI->isCopy() &&
+                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
+                     TRI->isAGPR(*MRI, Use.getReg());
       if (UseMI->isCopy() || UseMI->isRegSequence()) {
         if (UseMI->isCopy() &&
           UseMI->getOperand(0).getReg().isPhysical() &&
@@ -794,11 +800,21 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
       }
     }
   }
+
+  // When every use is an AGPR use and numVGPRUses is non-zero, switch the
+  // PHI result to the equivalent AGPR register class.
+  Register PHIRes = MI.getOperand(0).getReg();
+  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
+  if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
+    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
+    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
+  }
+
   bool hasVGPRInput = false;
   for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
     unsigned InputReg = MI.getOperand(i).getReg();
     MachineInstr *Def = MRI->getVRegDef(InputReg);
-    if (TRI->isVGPR(*MRI, InputReg)) {
+    if (TRI->isVectorRegister(*MRI, InputReg)) {
       if (Def->isCopy()) {
         unsigned SrcReg = Def->getOperand(1).getReg();
         const TargetRegisterClass *RC =
@@ -810,15 +826,14 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
       break;
     }
     else if (Def->isCopy() &&
-             TRI->isVGPR(*MRI, Def->getOperand(1).getReg())) {
+             TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
       hasVGPRInput = true;
       break;
     }
   }
-  unsigned PHIRes = MI.getOperand(0).getReg();
-  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
 
-  if ((!TRI->isVGPR(*MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) &&
+  if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
+       RC0 != &AMDGPU::VReg_1RegClass) &&
       (hasVGPRInput || numVGPRUses > 1)) {
     LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
     TII->moveToVALU(MI);
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
new file mode 100644
index 0000000000000..02f7c9bcee712
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; The checks below expect 32 v_accvgpr_write_b32 before the loop label, no
+; v_accvgpr instruction on either side of the MFMA inside the loop, and 32
+; v_accvgpr_read_b32 after the loop back-edge branch.
+; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
+; GCN-COUNT-32: v_accvgpr_write_b32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-COUNT-32: v_accvgpr_read_b32
+define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()