diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 67b91e14fcca8a..3f1e980627d889 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -666,32 +666,34 @@ void SIFoldOperands::foldOperand(
       return;
 
     const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
-    if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
-      MachineRegisterInfo::use_iterator NextUse;
-      SmallVector<FoldCandidate, 4> CopyUses;
-      for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(DestReg),
-                                             E = MRI->use_end();
-           Use != E; Use = NextUse) {
-        NextUse = std::next(Use);
-        // There's no point trying to fold into an implicit operand.
-        if (Use->isImplicit())
-          continue;
-
-        FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
-                                         &UseMI->getOperand(1));
-        CopyUses.push_back(FC);
-      }
-      for (auto &F : CopyUses) {
-        foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
+    if (!DestReg.isPhysical()) {
+      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
+        MachineRegisterInfo::use_iterator NextUse;
+        SmallVector<FoldCandidate, 4> CopyUses;
+        for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(DestReg),
+                                               E = MRI->use_end();
+             Use != E; Use = NextUse) {
+          NextUse = std::next(Use);
+          // There's no point trying to fold into an implicit operand.
+          if (Use->isImplicit())
+            continue;
+
+          FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
+                                           &UseMI->getOperand(1));
+          CopyUses.push_back(FC);
+        }
+        for (auto &F : CopyUses) {
+          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
+        }
       }
-    }
 
-    if (DestRC == &AMDGPU::AGPR_32RegClass &&
-        TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-      UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
-      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
-      CopiesToReplace.push_back(UseMI);
-      return;
+      if (DestRC == &AMDGPU::AGPR_32RegClass &&
+          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+        CopiesToReplace.push_back(UseMI);
+        return;
+      }
     }
 
     // In order to fold immediates into copies, we need to change the
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
index 9164e5e2679140..e921248cc32504 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -87,3 +87,26 @@ body: |
     S_ENDPGM 0, implicit $vgpr0
 
 ...
+
+# The users of $vgpr1 should not be visited for further immediate
+# folding.
+
+# GCN-LABEL: name: no_fold_physreg_users_vgpr{{$}}
+# GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: S_NOP 0, implicit-def $vgpr1
+# GCN-NEXT: %2:vgpr_32 = COPY $vgpr1
+# GCN-NEXT: $vgpr2 = COPY %2
+---
+name: no_fold_physreg_users_vgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 0
+    %1:vgpr_32 = COPY %0
+    $vgpr1 = COPY %0
+    S_NOP 0, implicit-def $vgpr1
+    %2:vgpr_32 = COPY $vgpr1
+    $vgpr2 = COPY %2
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
new file mode 100644
index 00000000000000..6995cf6845553b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Make sure the return value of the first call is not overwritten with
+; a constant before the fadd use.
+
+; CHECK-LABEL: vgpr_multi_use_imm_fold:
+; CHECK: v_mov_b32_e32 v0, 0{{$}}
+; CHECK: v_mov_b32_e32 v1, 2.0{{$}}
+; CHECK: s_swappc_b64
+; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0
+; CHECK: s_swappc_b64
+define amdgpu_kernel void @vgpr_multi_use_imm_fold() {
+entry:
+  store double 0.0, double addrspace(1)* undef, align 8
+  %call0 = tail call fastcc double @__ocml_log_f64(double 2.0)
+  %op = fadd double %call0, 0.0
+  %call1 = tail call fastcc double @__ocml_sqrt_f64(double %op)
+  ret void
+}
+
+declare hidden fastcc double @__ocml_log_f64(double)
+declare hidden fastcc double @__ocml_sqrt_f64(double)