From f9253b672056f2d7f223396bfb87cbb4df042f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Tue, 23 Sep 2025 19:08:52 -0700 Subject: [PATCH 1/2] CodeGen: Handle bundled instructions in two-address-instructions pass If the instruction with tied operands is a BUNDLE instruction and we handle it by replacing an operand, then we need to update the corresponding internal operands as well. Otherwise, the resulting MIR is invalid. The test case is degenerate in the sense that the bundle only contains a single instruction, but it is sufficient to exercise this issue. commit-id:6760a9b7 --- .../lib/CodeGen/TwoAddressInstructionPass.cpp | 11 ++++ llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 57 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 414e414738b71..b99e1c7f19b71 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1665,6 +1665,17 @@ void TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI, // by SubRegB is compatible with RegA with no subregister. So regardless of // whether the dest oper writes a subreg, the source oper should not. MO.setSubReg(0); + + // Update uses of RegB to uses of RegA inside the bundle. + if (MI->isBundle()) { + for (MachineOperand &MO : mi_bundle_ops(*MI)) { + if (MO.isReg() && MO.getReg() == RegB) { + assert(MO.getSubReg() == 0 && SubRegB == 0 && + "tied subregister uses in bundled instructions not supported"); + MO.setReg(RegA); + } + } + } } if (AllUsesCopied) { diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir new file mode 100644 index 0000000000000..696962a88c8b8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir @@ -0,0 +1,57 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GCN %s + +# Exercise very basic handling of BUNDLE'd instructions by the two-address-instruction pass. + +# This test is an example where it is best to keep the two-address instruction +# and resolve the tie with a COPY that is expected to be coalesced. +--- +name: test_fmac_bundle +body: | + bb.0: + + ; GCN-LABEL: name: test_fmac_bundle + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit [[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec { + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec + ; GCN-NEXT: } + %10:vgpr_32 = COPY $vgpr0 + %11:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed %2(tied-def 0), implicit $mode, implicit $exec { + %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit $mode, implicit $exec + } + +... + +# This test is an example where conversion to three-address form would be beneficial. +--- +name: test_fmac_reuse_bundle +body: | + bb.0: + + ; GCN-LABEL: name: test_fmac_reuse_bundle + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit [[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec { + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec + ; GCN-NEXT: } + ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec + %2:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit %2(tied-def 0), implicit $mode, implicit $exec { + %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit $mode, implicit $exec + } + %4:vgpr_32 = V_ADD_U32_e64 %3, %2, 0, implicit $exec + +... From cc06ca25470188cc8e767eab72fcfe83958cf4b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Tue, 7 Oct 2025 12:17:02 -0700 Subject: [PATCH 2/2] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions This is in preparation for future changes in AMDGPU that will make more substantial use of bundles pre-RA. For now, simply test this with degenerate (single-instruction) bundles. commit-id:4a30cb78 --- .../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 +++++++++-------- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 58 +++++++++++++++++-- llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++- 3 files changed, 89 insertions(+), 32 deletions(-) diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index b99e1c7f19b71..562a6a00045f5 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr( if (!NewMI) return false; - LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi); - LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI); - - // If the old instruction is debug value tracked, an update is required. - if (auto OldInstrNum = mi->peekDebugInstrNum()) { - assert(mi->getNumExplicitDefs() == 1); - assert(NewMI->getNumExplicitDefs() == 1); - - // Find the old and new def location. - unsigned OldIdx = mi->defs().begin()->getOperandNo(); - unsigned NewIdx = NewMI->defs().begin()->getOperandNo(); - - // Record that one def has been replaced by the other. - unsigned NewInstrNum = NewMI->getDebugInstrNum(); - MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx), - std::make_pair(NewInstrNum, NewIdx)); - } - - MBB->erase(mi); // Nuke the old inst. - for (MachineInstr &MI : MIS) DistanceMap.insert(std::make_pair(&MI, Dist++)); - Dist--; + + if (&*mi == NewMI) { + LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi); + } else { + LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi); + LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI); + + // If the old instruction is debug value tracked, an update is required. + if (auto OldInstrNum = mi->peekDebugInstrNum()) { + assert(mi->getNumExplicitDefs() == 1); + assert(NewMI->getNumExplicitDefs() == 1); + + // Find the old and new def location. + unsigned OldIdx = mi->defs().begin()->getOperandNo(); + unsigned NewIdx = NewMI->defs().begin()->getOperandNo(); + + // Record that one def has been replaced by the other. + unsigned NewInstrNum = NewMI->getDebugInstrNum(); + MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx), + std::make_pair(NewInstrNum, NewIdx)); + } + + MBB->erase(mi); // Nuke the old inst. + Dist--; + } + mi = NewMI; nmi = std::next(mi); @@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist); + // Give targets a chance to convert bundled instructions. + bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle); + // If the instruction is convertible to 3 Addr, instead // of returning try 3 Addr transformation aggressively and // use this variable to check later. Because it might be better. @@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( // addl %esi, %edi // movl %edi, %eax // ret - if (Commuted && !MI.isConvertibleTo3Addr()) + if (Commuted && !ConvertibleTo3Addr) return false; if (shouldOnlyCommute) @@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( regBKilled = isKilled(MI, regB, true); } - if (MI.isConvertibleTo3Addr()) { + if (ConvertibleTo3Addr) { // This instruction is potentially convertible to a true // three-address instruction. Check if it is profitable. if (!regBKilled || isProfitableToConv3Addr(regA, regB)) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6ce18ea921a9b..1fde07190339c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4047,10 +4047,29 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { MachineBasicBlock &MBB = *MI.getParent(); + MachineInstr *CandidateMI = &MI; + + if (MI.isBundle()) { + // This is a temporary placeholder for bundle handling that enables us to + // exercise the relevant code paths in the two-address instruction pass. + if (MI.getBundleSize() != 1) + return nullptr; + CandidateMI = MI.getNextNode(); + } + ThreeAddressUpdates U; - MachineInstr *NewMI = convertToThreeAddressImpl(MI, U); + MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U); + if (!NewMI) + return nullptr; - if (NewMI) { + if (MI.isBundle()) { + CandidateMI->eraseFromBundle(); + + for (MachineOperand &MO : MI.all_defs()) { + if (MO.isTied()) + MI.untieRegOperand(MO.getOperandNo()); + } + } else { updateLiveVariables(LV, MI, *NewMI); if (LIS) { LIS->ReplaceMachineInstrInMaps(MI, *NewMI); @@ -4091,7 +4110,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LV->getVarInfo(DefReg).AliveBlocks.clear(); } - if (LIS) { + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MO : MI.all_uses()) { + if (MO.isReg() && MO.getReg() == DefReg) { + assert(MO.getSubReg() == 0 && + "tied sub-registers in bundles currently not supported"); + MI.removeOperand(MO.getOperandNo()); + break; + } + } + + if (LIS) + LIS->shrinkToUses(&LIS->getInterval(DefReg)); + } + } else if (LIS) { LiveInterval &DefLI = LIS->getInterval(DefReg); // We cannot delete the original instruction here, so hack out the use @@ -4106,11 +4140,27 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, } } + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + } + + auto MO = MachineOperand::CreateReg(DummyReg, false); + MO.setIsUndef(true); + MI.addOperand(MO); + } + LIS->shrinkToUses(&DefLI); } } - return NewMI; + return MI.isBundle() ? &MI : NewMI; } MachineInstr * diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir index 696962a88c8b8..8ae50d8e0e071 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir @@ -31,7 +31,7 @@ body: | ... -# This test is an example where conversion to three-address form would be beneficial. +# This test is an example where conversion to three-address form is beneficial. --- name: test_fmac_reuse_bundle body: | @@ -41,11 +41,10 @@ body: | ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit [[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec { - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec + ; GCN-NEXT: BUNDLE implicit-def %3, implicit [[DEF]], implicit [[DEF1]], implicit [[COPY]], implicit $mode, implicit $exec { + ; GCN-NEXT: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F32_e64 0, killed [[DEF]], 0, killed [[DEF1]], 0, killed [[COPY]], 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: } - ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FMA_F32_e64_]], [[COPY]], 0, implicit $exec %2:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF