Skip to content
165 changes: 143 additions & 22 deletions llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,9 +280,7 @@ static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
}

/// Return true iff \p LHS and \p RHS are register operands that refer to
/// the same register and the same subregister index. Non-register
/// operands never compare equal.
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() && RHS.isReg() && LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

Expand Down Expand Up @@ -383,31 +381,128 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
return Mods;
}

// The following functions are helpers for dealing with REG_SEQUENCE
// instructions. Those instructions are used to represent copies to
// subregisters in SSA form.
//
// This pass should be able to peek through REG_SEQUENCE
// instructions. An access to a subregister of a register defined
// by a REG_SEQUENCE should be handled as if the register
// that is being copied to the subregister was accessed.
// Consider the following example:
// %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0
// %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0
// %3:sreg_32 = S_MOV_B32 255
// %4:vgpr_32 = V_AND_B32_e64 %2, %3
// %5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %1, %4, 0
//
// The V_ADD_CO_U32_e64 instructions will be combined with the
// V_AND_B32_e64 into an SDWA instruction.
//
// If one or more of the operands of V_ADD_CO_U32_e64 are accessed
// through the subregisters of a REG_SEQUENCE as in the following
// variation of the previous example, the optimization should still be
// able to proceed in the same way:
//
// [...]
// %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
// %5:sreg_32 = S_MOV_B32 255
// %6:vgpr_32 = V_AND_B32_e64 %2, %5
// %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
// %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0
//
// To this end, the SDWASrcOperand implementation uses the following
// functions to find out the register that is used as the source of
// the subregister value and it uses this register directly instead of
// the REG_SEQUENCE subregister.

/// Return the subregister of the REG_SEQUENCE \p RegSeq
/// which is copied from \p Op, i.e. the operand following
/// \p Op in the operands of \p RegSeq, or nullopt if the
/// the \p Op is not an operand of \p RegSeq.
///
/// Example:
/// For the instruction REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1,
/// return %subreg.sub0 for \p Reg = %1 and %subreg.sub1 for \p Reg = %2.
static std::optional<unsigned> regSequenceFindSubreg(const MachineInstr &RegSeq,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This patch is inventing ever so slightly different versions of helpers which exist in some form or another

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see the risk of doing so, but I could not find a comparable existing function. It seems that TargetInstrInfo::getRegSequenceInputs could be used, but this would require the TRI which I don't need here, the interface is a bit awkward (e.g. I would have to provide a DefIdx value which does not have any meaning) here and it would create a vector of all values although I just want the single register .... so it does not really seem right to me. Should I perhaps move this function to somewhere else for general use?

Register Reg) {
if (!RegSeq.isRegSequence())
return {};

auto *End = RegSeq.operands_end();
// Operand pair at indices (i+1, i+2) is (register, subregister)
for (auto *It = RegSeq.operands_begin() + 1; It != End; It += 2) {
if (It->getReg() == Reg)
return (It + 1)->getImm();
}

return {};
}

/// Return the single user of \p RegSeq which accesses the subregister
/// that is copied from \p Reg. Returns nullptr if \p RegSeq does not
/// have exactly one non-debug user, if that user does not read the
/// subregister corresponding to \p Reg, or if \p Reg is not an operand
/// of \p RegSeq.
///
/// Example:
/// %0:vgpr_32 = IMPLICIT_DEF
/// %1:vgpr_32 = IMPLICIT_DEF
/// %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
/// %3:vgpr_32, %4:sreg_64_xexec = V_ADD_CO_U32_e64 %2.sub0, 2, 0, implicit $exec
/// [...]
///
/// If \p RegSeq is the MI defining %2 and \p Reg = %0, the function
/// returns the V_ADD_CO_U32_e64 instruction (the user of %2.sub0),
/// provided that %2 has no other uses. For any other register, it
/// returns nullptr.
static MachineInstr *regSequenceFindSingleSubregUser(MachineInstr &RegSeq,
                                                     Register Reg,
                                                     MachineRegisterInfo *MRI) {
  // Guard against Reg not being copied by RegSeq at all; the optional
  // must not be dereferenced unconditionally.
  std::optional<unsigned> SubReg = regSequenceFindSubreg(RegSeq, Reg);
  if (!SubReg)
    return nullptr;

  Register SeqReg = RegSeq.getOperand(0).getReg();

  MachineInstr *User = MRI->getOneNonDBGUser(SeqReg);
  if (User)
    for (auto &Op : User->operands())
      if (Op.isReg() && Op.getReg() == SeqReg && Op.getSubReg() == *SubReg)
        return User;

  return nullptr;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
const GCNSubtarget &ST,
SDWAOperandsMap *PotentialMatches) {
if (PotentialMatches != nullptr) {
// Fill out the map for all uses if all can be converted
MachineOperand *Reg = getReplacedOperand();
if (!Reg->isReg() || !Reg->isDef())
MachineOperand *Op = getReplacedOperand();
if (!Op->isReg() || !Op->isDef())
return nullptr;

for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
// Check that all instructions that use Reg can be converted
if (!isConvertibleToSDWA(UseMI, ST, TII) ||
!canCombineSelections(UseMI, TII))
Register Reg = Op->getReg();
MachineRegisterInfo *MRI = getMRI();

// Check that all instructions that use Reg can be converted
SmallVector<MachineInstr *, 4> Uses;
for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
// Allow for indirect uses through REG_SEQUENCE instructions:
// consider the user (which is assumed to be unique) of the
// subregister defined by Reg in UseMI as the user of Reg
// instead of UseMi if UseMI is a REG_SEQUENCE.
MachineInstr *SrcMI =
UseMI.isRegSequence()
? regSequenceFindSingleSubregUser(UseMI, Reg, MRI)
: &UseMI;
if (!SrcMI || !isConvertibleToSDWA(*SrcMI, ST, TII) ||
!canCombineSelections(*SrcMI, TII))
return nullptr;

Uses.push_back(SrcMI);
}
// Now that it's guaranteed all uses are legal, iterate over the uses again
// to add them for later conversion.
for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
// Should not get a subregister here
assert(isSameReg(UseMO, *Reg));
auto &PM = *PotentialMatches;
for (auto *Use : Uses)
PM[Use].push_back(this);

SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
MachineInstr *UseMI = UseMO.getParent();
potentialMatchesMap[UseMI].push_back(this);
}
return nullptr;
}

Expand All @@ -418,10 +513,36 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
return nullptr;

MachineInstr *Parent = PotentialMO->getParent();
if (Parent->isRegSequence()) {
Parent = regSequenceFindSingleSubregUser(
*Parent, getReplacedOperand()->getReg(), getMRI());
return Parent && canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}

/// Returns true if \p RHS is either the same register as LHS or the
/// defining instruction of \p LHS is a REG_SEQUENCE in which \p
/// RHS occurs as the operand for the register that corresponds to the
/// subregister of LHS.
static bool isSameRegOrCopy(const MachineOperand &LHS,
const MachineOperand &RHS,
const MachineRegisterInfo *MRI) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const MachineRegisterInfo *MRI) {
const MachineRegisterInfo &MRI) {

if (isSameReg(LHS, RHS))
return true;

const MachineOperand *Def = findSingleRegDef(&LHS, MRI);
const MachineInstr *MI = Def ? Def->getParent() : nullptr;

// TODO Handle other copy-like instructions?
if (!MI || !MI->isRegSequence())
return false;

auto SubReg = regSequenceFindSubreg(*MI, RHS.getReg());
return SubReg && LHS.getSubReg() == SubReg;
Comment on lines +535 to +543
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If all you're doing is looking through copy-like instructions, perhaps this pass should not be directly responsible for this. I think either peephole-opt or SIFoldOperands should be pulling the subregister read through the reg_sequence. This change isn't core to the SDWA problem.

i.e.

    %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
    %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
    %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec

Should have been rewritten to directly read the reg_sequence inputs in the first place

    %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %1, %6, 0, implicit $exec

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll try and see if I can get this to work! Sounds like a, potentially, much better approach to me.

}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
switch (MI.getOpcode()) {
case AMDGPU::V_CVT_F32_FP8_sdwa:
Expand Down Expand Up @@ -455,14 +576,13 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
MachineOperand *SrcMods =
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
assert(Src && (Src->isReg() || Src->isImm()));
if (!isSameReg(*Src, *getReplacedOperand())) {
if (!isSameRegOrCopy(*Src, *getReplacedOperand(), getMRI())) {
// If this is not src0 then it could be src1
Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

if (!Src ||
!isSameReg(*Src, *getReplacedOperand())) {
if (!Src || !isSameRegOrCopy(*Src, *getReplacedOperand(), getMRI())) {
// It's possible this Src is a tied operand for
// UNUSED_PRESERVE, in which case we can either
// abandon the peephole attempt, or if legal we can
Expand Down Expand Up @@ -502,13 +622,14 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
!isSameReg(*Src, *getReplacedOperand())) {
!isSameRegOrCopy(*Src, *getReplacedOperand(), getMRI())) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No tests with these special case users

// In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
// src2. This is not allowed.
return false;
}

assert(isSameReg(*Src, *getReplacedOperand()) &&
MachineOperand &ReplacedOp = *getReplacedOperand();
assert(isSameRegOrCopy(*Src, ReplacedOp, getMRI()) &&
(IsPreserveSrc || (SrcSel && SrcMods)));
}
copyRegOperand(*Src, *getTargetOperand());
Expand Down
133 changes: 133 additions & 0 deletions llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck %s

---
# The V_ADD_CO_U32 reads both of its sources through sub0 of a
# REG_SEQUENCE; the pass is expected to look through the REG_SEQUENCEs
# and still convert the add to V_ADD_CO_U32_sdwa (see CHECK lines).
name: sdwa_reg_sequence
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $vgpr0
    ; CHECK-LABEL: name: sdwa_reg_sequence
    ; CHECK: liveins: $vgpr0
    ; CHECK-NEXT: {{ $}}
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 10, 0, implicit $exec
    ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 20, 0, implicit $exec
    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
    ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
    ; CHECK-NEXT: [[V_ADD_CO_U32_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_sdwa 0, [[REG_SEQUENCE]].sub0, 0, [[V_ADD_U32_e64_1]], 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
    ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, $vcc, 0, implicit $exec
    ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_sdwa]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
    ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
    ; CHECK-NEXT: S_ENDPGM 0
    %0:vgpr_32 = COPY $vgpr0
    %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
    %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
    %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
    %5:sreg_32 = S_MOV_B32 255
    %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
    %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
    %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
    %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
    %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
    %13:sreg_64 = IMPLICIT_DEF
    %14:vreg_64 = COPY %13
    GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
    S_ENDPGM 0
...

---
# The src0 of the V_ADD_CO_U32 reads %5.sub1, where %5 is a
# REG_SEQUENCE that itself copies a subregister (%4.sub1) of another
# REG_SEQUENCE. The CHECK lines show the add is still converted to
# V_ADD_CO_U32_sdwa, reading [[REG_SEQUENCE1]].sub1.
name: sdwa_reg_sequence_composed_subregs
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-LABEL: name: sdwa_reg_sequence_composed_subregs
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]].sub0, 10, 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]].sub1, 20, 0, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[REG_SEQUENCE]].sub1, %subreg.sub1
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
; CHECK-NEXT: [[V_ADD_CO_U32_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_sdwa 0, [[REG_SEQUENCE1]].sub1, 0, [[V_ADD_U32_e64_1]], 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, $vcc, 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_sdwa]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:vreg_64 = COPY $vgpr1_vgpr2
%1:vgpr_32 = V_ADD_U32_e64 %0.sub0, 10, 0, implicit $exec
%2:vgpr_32 = V_ADD_U32_e64 %0.sub1, 20, 0, implicit $exec
%3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
%5:vreg_64 = REG_SEQUENCE %0.sub0, %subreg.sub0, %4.sub1, %subreg.sub1
%6:sreg_32 = S_MOV_B32 255
%7:vgpr_32 = V_AND_B32_e64 killed %2, killed %6, implicit $exec
%8:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, %3, %subreg.sub1
%9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %5.sub1, %8.sub0, 0, implicit $exec
%11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %10, 0, implicit $exec
%13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
%15:vreg_64 = COPY %13
GLOBAL_STORE_DWORDX2 killed %15, killed %13, 0, 0, implicit $exec :: (store (s64), addrspace 1)
S_ENDPGM 0
...


---
# Here the V_AND result %6 has two uses (the REG_SEQUENCE %7 and the
# COPY %15). The CHECK lines show the V_ADD_CO_U32 is NOT converted to
# SDWA (it stays V_ADD_CO_U32_e64) — presumably because the source
# operand cannot be folded when it has additional users; verify against
# the pass logic.
name: sdwa_reg_sequence_multiple_uses
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
; CHECK-LABEL: name: sdwa_reg_sequence_multiple_uses
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 10, 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 20, 0, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, [[REG_SEQUENCE1]].sub0, 0, implicit $exec
; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
%2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
%3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
%5:sreg_32 = S_MOV_B32 255
%6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
%7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
%8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
%10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
%12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
%13:sreg_64 = IMPLICIT_DEF
%14:vreg_64 = COPY %13
%15:vgpr_32 = COPY %6
GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
S_ENDPGM 0
...