From dc0089eaa822ff84ad394d2f1299d05059314c9f Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Tue, 14 Oct 2025 17:28:43 -0500 Subject: [PATCH 1/9] add support for v_pk_mov unpacking, redundant code removal, bug fixes --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 185 +++--- ...ck-non-coissue-insts-post-ra-scheduler.mir | 601 +++++++++++++++++- 2 files changed, 699 insertions(+), 87 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index c684f9e3a6e2c..379dc4f282743 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -47,9 +47,6 @@ class SIPreEmitPeephole { const MachineBasicBlock &From, const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); - // Check if the machine instruction being processed is a supported packed - // instruction. - bool isUnpackingSupportedInstr(MachineInstr &MI) const; // Creates a list of packed instructions following an MFMA that are suitable // for unpacking. void collectUnpackingCandidates(MachineInstr &BeginMI, @@ -62,7 +59,7 @@ class SIPreEmitPeephole { // v_fma_f32 v1, v0, v2, v2 // Here, we have overwritten v0 before we use it. This function checks if // unpacking can lead to such a situation. - bool canUnpackingClobberRegister(const MachineInstr &MI); + bool canUnpackingClobberRegister(MachineInstr &MI); // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for // this transformation. @@ -456,22 +453,8 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, // If support is extended to new operations, add tests in // llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. -bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { - if (!TII->isNeverCoissue(MI)) - return false; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("Fully covered switch"); -} -bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { +bool SIPreEmitPeephole::canUnpackingClobberRegister(MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); // Only the first register in the register pair needs to be checked due to the @@ -482,21 +465,9 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { // Such scenarios can arise due to specific combinations of op_sel and // op_sel_hi modifiers. Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); - - const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - if (Src0MO && Src0MO->isReg()) { - Register SrcReg0 = Src0MO->getReg(); - unsigned Src0Mods = - TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); - Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) - ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) - : TRI->getSubReg(SrcReg0, AMDGPU::sub0); - // Check if the register selected by op_sel_hi is the same as the first - // register in the destination register pair. - if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) - return true; - } - + uint16_t UnpackedOpCode = mapToUnpackedOpcode(MI); + bool UnpackedInstHasOneSrcOp = + !AMDGPU::hasNamedOperand(UnpackedOpCode, AMDGPU::OpName::src1); const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (Src1MO && Src1MO->isReg()) { Register SrcReg1 = Src1MO->getReg(); @@ -505,25 +476,44 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1) ? TRI->getSubReg(SrcReg1, AMDGPU::sub1) : TRI->getSubReg(SrcReg1, AMDGPU::sub0); + // Check if the register selected by op_sel_hi is the same as the first + // register in the destination register pair. if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) return true; } - // Applicable for packed instructions with 3 source operands, such as - // V_PK_FMA. - if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - const MachineOperand *Src2MO = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); - if (Src2MO && Src2MO->isReg()) { - Register SrcReg2 = Src2MO->getReg(); - unsigned Src2Mods = - TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm(); - Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) - ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) - : TRI->getSubReg(SrcReg2, AMDGPU::sub0); - if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) + // V_MOV_B32s have one src operand. Other candidate unpacked instructions with + // 2 or more src operands will perform the following checks. + if (!UnpackedInstHasOneSrcOp) { + const MachineOperand *Src0MO = + TII->getNamedOperand(MI, AMDGPU::OpName::src0); + if (Src0MO && Src0MO->isReg()) { + Register SrcReg0 = Src0MO->getReg(); + unsigned Src0Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); + Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) + : TRI->getSubReg(SrcReg0, AMDGPU::sub0); + if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) return true; } + + // Applicable for packed instructions with 3 source operands, such as + // V_PK_FMA. + if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { + const MachineOperand *Src2MO = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2MO && Src2MO->isReg()) { + Register SrcReg2 = Src2MO->getReg(); + unsigned Src2Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm(); + Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) + : TRI->getSubReg(SrcReg2, AMDGPU::sub0); + if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) + return true; + } + } } return false; } @@ -540,6 +530,11 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { return AMDGPU::V_MUL_F32_e64; case AMDGPU::V_PK_FMA_F32: return AMDGPU::V_FMA_F32_e64; + case AMDGPU::V_PK_MOV_B32: + // Source modifiers aren't handled for MOV due to prevailing restrictions. + // Hence, 64-bit encoding isn't necessary. Will create unnecessary + // instruction cache pressure. + return AMDGPU::V_MOV_B32_e32; default: return std::numeric_limits::max(); } @@ -549,6 +544,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods, bool IsHiBits, const MachineOperand &SrcMO) { + unsigned NewOpCode = NewMI->getOpcode(); unsigned NewSrcMods = 0; unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; @@ -561,12 +557,18 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI, // modifier for the higher 32 bits. Unpacked VOP3 instructions support // ABS, but do not support NEG_HI. Therefore we need to explicitly add the // NEG modifier if present in the packed instruction. + bool IsSrcModifidiersSupported = + AMDGPU::hasNamedOperand(NewOpCode, AMDGPU::OpName::src0_modifiers); + bool UnpackedInstHasOneSrcOp = + !AMDGPU::hasNamedOperand(NewOpCode, AMDGPU::OpName::src1); + if (SrcMods & NegModifier) NewSrcMods |= SISrcMods::NEG; // Src modifiers. Only negative modifiers are added if needed. Unpacked // operations do not have op_sel, therefore it must be handled explicitly as // done below. - NewMI.addImm(NewSrcMods); + if (IsSrcModifidiersSupported) + NewMI.addImm(NewSrcMods); if (SrcMO.isImm()) { NewMI.addImm(SrcMO.getImm()); return; @@ -594,7 +596,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI, bool OpSel = SrcMods & SISrcMods::OP_SEL_0; bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1; bool KillState = true; - if ((OpSel == OpSelHi) && !IsHiBits) + if ((OpSel == OpSelHi) && !IsHiBits && !UnpackedInstHasOneSrcOp) KillState = false; UnpackedSrcMO.setIsKill(KillState); } @@ -612,10 +614,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates( for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; + uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr); if (Instr.isMetaInstruction()) continue; if ((Instr.isTerminator()) || - (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) || + (TII->isNeverCoissue(Instr) && + (UnpackedOpCode == std::numeric_limits::max())) || (SIInstrInfo::modifiesModeRegister(Instr) && Instr.modifiesRegister(AMDGPU::EXEC, TRI))) return; @@ -639,7 +643,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) return; } - if (!isUnpackingSupportedInstr(Instr)) + if (UnpackedOpCode == std::numeric_limits::max()) continue; if (canUnpackingClobberRegister(Instr)) @@ -661,8 +665,28 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { MachineOperand DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); - assert(UnpackedOpcode != std::numeric_limits::max() && - "Unsupported Opcode"); + // V_MOV_B32 does not support source modifiers. Without source modifiers, we + // cannot be faithful to the packed instruction semantics in few cases. This + // is true when the packed instruction has NEG and NEG_HI modifiers. We should + // abort unpacking if: + // 1. hi/lo bits selected by OPSEL for src0 are also marked by NEG or NEG_HI. + // 2. hi/lo bits selected by OPSEL_HI for src1 are also marked by NEG or + // NEG_HI. + // Packed instructions do not specify ABS modifiers, so we can safely ignore + // those. + if (!AMDGPU::hasNamedOperand(UnpackedOpcode, + AMDGPU::OpName::src0_modifiers)) { + unsigned Src0Mods = + TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm(); + unsigned Src1Mods = + TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm(); + unsigned negMask0 = + (Src0Mods & SISrcMods::OP_SEL_0) ? SISrcMods::NEG_HI : SISrcMods::NEG; + unsigned negMask1 = + (Src1Mods & SISrcMods::OP_SEL_1) ? SISrcMods::NEG_HI : SISrcMods::NEG; + if ((Src0Mods & negMask0) || (Src1Mods & negMask1)) + return; + } MachineInstrBuilder Op0LOp1L = createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false); @@ -689,8 +713,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, bool IsHiBits) { MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0); - const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1); + const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0); + const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1); Register DstReg = I.getOperand(0).getReg(); unsigned OpCode = I.getOpcode(); Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1) @@ -704,8 +728,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst - addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1); - addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2); + if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::src0) && + AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::src1)) { + addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0); + addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1); + } else { + const MachineOperand *SrcMO = IsHiBits ? SrcMO1 : SrcMO0; + unsigned SrcMods = IsHiBits ? Src1Mods : Src0Mods; + addOperandAndMods(NewMI, SrcMods, IsHiBits, *SrcMO); + } if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { const MachineOperand *SrcMO3 = @@ -714,10 +745,12 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm(); addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3); } - NewMI.addImm(ClampVal); // clamp + if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::clamp)) + NewMI.addImm(ClampVal); // clamp // Packed instructions do not support output modifiers. safe to assign them 0 // for this use case - NewMI.addImm(0); // omod + if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::omod)) + NewMI.addImm(0); // omod return NewMI; } @@ -789,22 +822,24 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { // TODO: Fold this into previous block, if possible. Evaluate and handle any // side effects. - for (MachineBasicBlock &MBB : MF) { - // Unpack packed instructions overlapped by MFMAs. This allows the compiler - // to co-issue unpacked instructions with MFMA - auto SchedModel = TII->getSchedModel(); - SetVector InstrsToUnpack; - for (auto &MI : make_early_inc_range(MBB.instrs())) { - if (!SIInstrInfo::isMFMA(MI)) - continue; - const MCSchedClassDesc *SchedClassDesc = - SchedModel.resolveSchedClass(&MI); - uint16_t NumMFMACycles = - SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; - collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles); - } - for (MachineInstr *MI : InstrsToUnpack) { - performF32Unpacking(*MI); + if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) { + for (MachineBasicBlock &MBB : MF) { + // Unpack packed instructions overlapped by MFMAs. This allows the + // compiler to co-issue unpacked instructions with MFMA + auto SchedModel = TII->getSchedModel(); + SetVector InstrsToUnpack; + for (auto &MI : make_early_inc_range(MBB.instrs())) { + if (!SIInstrInfo::isMFMA(MI)) + continue; + const MCSchedClassDesc *SchedClassDesc = + SchedModel.resolveSchedClass(&MI); + uint16_t NumMFMACycles = + SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles); + } + for (MachineInstr *MI : InstrsToUnpack) { + performF32Unpacking(*MI); + } } } diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 75ae76fdee19b..2f7c42f46e0c2 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -641,9 +641,6 @@ body: | ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec ; GFX950-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec ; GFX950-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec - ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX950-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX950-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec ; GFX950-NEXT: S_ENDPGM 0 ; ; GFX942-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped @@ -664,9 +661,6 @@ body: | ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec ; GFX942-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec ; GFX942-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec - ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX942-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX942-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped @@ -687,9 +681,6 @@ body: | ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec ; GFX90A-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90A-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90A-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec @@ -706,9 +697,6 @@ body: | $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec - $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -1167,3 +1155,592 @@ body: | $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec S_ENDPGM 0 + +... +--- +name: test_v_pk_mov_unpacking +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_v_pk_mov_unpacking + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_v_pk_mov_unpacking + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90A-LABEL: name: test_v_pk_mov_unpacking + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_PK_MOV_B32 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_v_pk_mov_with_neg_modifiers_not_unpacked_0 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_v_pk_mov_with_neg_modifiers_not_unpacked_0 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 9, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_v_pk_mov_with_neg_modifiers_not_unpacked_0 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 9, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90A-LABEL: name: test_v_pk_mov_with_neg_modifiers_not_unpacked_0 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 9, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_PK_MOV_B32 9, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_v_pk_mov_with_neg_modifiers_not_unpacked_1 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_v_pk_mov_with_neg_modifiers_not_unpacked_1 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, killed $vgpr0_vgpr1, 10, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_v_pk_mov_with_neg_modifiers_not_unpacked_1 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, killed $vgpr0_vgpr1, 10, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90A-LABEL: name: test_v_pk_mov_with_neg_modifiers_not_unpacked_1 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, killed $vgpr0_vgpr1, 10, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_PK_MOV_B32 8, killed $vgpr0_vgpr1, 10, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_2 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_2 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_2 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90A-LABEL: name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_2 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, killed $vgpr0_vgpr1, 9, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_PK_MOV_B32 8, killed $vgpr0_vgpr1, 9, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_3 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_3 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_3 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90A-LABEL: name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_3 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 10, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_PK_MOV_B32 10, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_4 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_4 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 14, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_4 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 14, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90A-LABEL: name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_4 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 14, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_PK_MOV_B32 14, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_v_pk_mov_op_sel_unpacking +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_v_pk_mov_op_sel_unpacking + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr1, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr2, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_v_pk_mov_op_sel_unpacking + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr1, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr2, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90A-LABEL: name: test_v_pk_mov_op_sel_unpacking + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 12, killed $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr10 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + $vgpr11 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_PK_MOV_B32 12, killed $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 From 8587628d8cd93754b5d5f1b3c001064ae0f7885f Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Tue, 14 Oct 2025 17:37:10 -0500 Subject: [PATCH 2/9] restoring deleted assert --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 379dc4f282743..2361c73f73481 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -665,6 +665,8 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { MachineOperand DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); + assert(UnpackedOpcode != std::numeric_limits::max() && + "Unsupported Opcode"); // V_MOV_B32 does not support source modifiers. Without source modifiers, we // cannot be faithful to the packed instruction semantics in few cases. This // is true when the packed instruction has NEG and NEG_HI modifiers. We should From 25b9c71035be05b2cb1cac3a175c4adbe55ab772 Mon Sep 17 00:00:00 2001 From: Akash Dutta <137309513+akadutta@users.noreply.github.com> Date: Wed, 5 Nov 2025 11:02:08 -0600 Subject: [PATCH 3/9] Update test with suggestion Co-authored-by: Matt Arsenault --- .../AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 2f7c42f46e0c2..992dbf7167b5e 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -1160,10 +1160,6 @@ body: | --- name: test_v_pk_mov_unpacking tracksRegLiveness: true - -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 From e2fbe0ce2b7579c4276700e6ceb9fc59fb83f3c2 Mon Sep 17 00:00:00 2001 From: Akash Dutta <137309513+akadutta@users.noreply.github.com> Date: Wed, 5 Nov 2025 11:03:53 -0600 Subject: [PATCH 4/9] Apply suggestion from @arsenm Co-authored-by: Matt Arsenault --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 0da450e577832..33fe8975d45d1 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -616,7 +616,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( MachineInstr &Instr = *I; uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr); bool IsUnpackable = - !(UnpackedOpCode == std::numeric_limits::max()); + UnpackedOpCode != std::numeric_limits::max(); if (Instr.isMetaInstruction()) continue; if ((Instr.isTerminator()) || From 08fdc0764b05e56e87f5b5bad54b15d2c61bd738 Mon Sep 17 00:00:00 2001 From: Akash Dutta <137309513+akadutta@users.noreply.github.com> Date: Wed, 5 Nov 2025 11:06:23 -0600 Subject: [PATCH 5/9] Apply suggestion from @arsenm Co-authored-by: Matt Arsenault --- .../AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 992dbf7167b5e..efdb5fb4b9bec 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -1161,7 +1161,7 @@ body: | name: test_v_pk_mov_unpacking tracksRegLiveness: true body: | - bb.0.entry: + bb.0: liveins: $sgpr4_sgpr5 ; GFX950-LABEL: name: test_v_pk_mov_unpacking ; GFX950: liveins: $sgpr4_sgpr5 From 44b3744c83f53f8a25deaf3315cea5bd18f966f9 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Wed, 5 Nov 2025 15:18:22 -0600 Subject: [PATCH 6/9] reduce code size, address code reviews --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 137 +++++++++---------- 1 file changed, 65 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 33fe8975d45d1..cb10216fbab3b 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -59,7 +59,7 @@ class SIPreEmitPeephole { // v_fma_f32 v1, v0, v2, v2 // Here, we have overwritten v0 before we use it. This function checks if // unpacking can lead to such a situation. - bool canUnpackingClobberRegister(MachineInstr &MI); + bool canUnpackingClobberRegister(const MachineInstr &MI); // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for // this transformation. @@ -454,7 +454,7 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, // If support is extended to new operations, add tests in // llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. -bool SIPreEmitPeephole::canUnpackingClobberRegister(MachineInstr &MI) { +bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); // Only the first register in the register pair needs to be checked due to the @@ -465,55 +465,48 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(MachineInstr &MI) { // Such scenarios can arise due to specific combinations of op_sel and // op_sel_hi modifiers. Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); - uint16_t UnpackedOpCode = mapToUnpackedOpcode(MI); - bool UnpackedInstHasOneSrcOp = - !AMDGPU::hasNamedOperand(UnpackedOpCode, AMDGPU::OpName::src1); - const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (Src1MO && Src1MO->isReg()) { - Register SrcReg1 = Src1MO->getReg(); - unsigned Src1Mods = - TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm(); - Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1) - ? TRI->getSubReg(SrcReg1, AMDGPU::sub1) - : TRI->getSubReg(SrcReg1, AMDGPU::sub0); - // Check if the register selected by op_sel_hi is the same as the first - // register in the destination register pair. - if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) - return true; - } - // V_MOV_B32s have one src operand. Other candidate unpacked instructions with - // 2 or more src operands will perform the following checks. - if (!UnpackedInstHasOneSrcOp) { - const MachineOperand *Src0MO = - TII->getNamedOperand(MI, AMDGPU::OpName::src0); - if (Src0MO && Src0MO->isReg()) { - Register SrcReg0 = Src0MO->getReg(); - unsigned Src0Mods = - TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); - Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) - ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) - : TRI->getSubReg(SrcReg0, AMDGPU::sub0); - if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) - return true; - } + // Lambda to check if a source operand causes clobbering + auto checkSrcClobber = [&](AMDGPU::OpName SrcName, AMDGPU::OpName ModsName) -> bool { + const MachineOperand *SrcMO = TII->getNamedOperand(MI, SrcName); + if (SrcMO && SrcMO->isReg()) { + Register SrcReg = SrcMO->getReg(); + unsigned SrcMods = TII->getNamedOperand(MI, ModsName)->getImm(); + Register HiSrcReg = (SrcMods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg, AMDGPU::sub1) + : TRI->getSubReg(SrcReg, AMDGPU::sub0); + return TRI->regsOverlap(UnpackedDstReg, HiSrcReg); + } + }; + + // Src1 should be checked before src0 to avoid false positives. + // For example, the following unpacked sequence is legal: + // $vgpr0_vgpr1 = V_PK_MUL_F32 8, $vgpr0_vgpr1, 8, $vgpr2_vgpr3 + // => + // $vgpr0 = V_MUL_F32 $vgpr0, $vgpr2 + // $vgpr1 = V_MUL_F32 $vgpr1, $vgpr3 + // Although the destination and source overlap in the first instruction ($vgpr0), $vgpr0 is not used as a source in the second instruction. + // Therefore, unpacking this sequence is safe. + // + // The following sequence, however, is not safe to unpack: + // $vgpr0_vgpr1 = V_PK_MUL_F32 0, $vgpr0_vgpr1, 8, $vgpr2_vgpr3 + // => + // $vgpr0 = V_MUL_F32 $vgpr0, $vgpr2 + // $vgpr1 = V_MUL_F32 $vgpr0, $vgpr3 + // In the unpacked version, $vgpr1 uses $vgpr0 as a source, but $vgpr0 was updated in the previous instruction. + // This behavior does not occur with the packed instruction. + // As a result, it is unsafe to unpack this sequence. + if (checkSrcClobber(AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers)) + return true; - // Applicable for packed instructions with 3 source operands, such as - // V_PK_FMA. - if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - const MachineOperand *Src2MO = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); - if (Src2MO && Src2MO->isReg()) { - Register SrcReg2 = Src2MO->getReg(); - unsigned Src2Mods = - TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm(); - Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) - ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) - : TRI->getSubReg(SrcReg2, AMDGPU::sub0); - if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) - return true; - } - } + if (checkSrcClobber(AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers)) + return true; + + // Applicable for packed instructions with 3 source operands, such as + // V_PK_FMA. + if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { + if (checkSrcClobber(AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers)) + return true; } return false; } @@ -646,7 +639,29 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } if (!IsUnpackable) continue; - + + // V_MOV_B32 does not support source modifiers. Without source modifiers, we + // cannot be faithful to the packed instruction semantics in few cases. This + // is true when the packed instruction has NEG and NEG_HI modifiers. We should + // abort unpacking if: + // 1. hi/lo bits selected by OPSEL for src0 are also marked by NEG or NEG_HI. + // 2. hi/lo bits selected by OPSEL_HI for src1 are also marked by NEG or + // NEG_HI. + // Packed instructions do not specify ABS modifiers, so we can safely ignore + // those. + if (!AMDGPU::hasNamedOperand(UnpackedOpCode, + AMDGPU::OpName::src0_modifiers)) { + unsigned Src0Mods = + TII->getNamedOperand(Instr, AMDGPU::OpName::src0_modifiers)->getImm(); + unsigned Src1Mods = + TII->getNamedOperand(Instr, AMDGPU::OpName::src1_modifiers)->getImm(); + unsigned negMask0 = + (Src0Mods & SISrcMods::OP_SEL_0) ? SISrcMods::NEG_HI : SISrcMods::NEG; + unsigned negMask1 = + (Src1Mods & SISrcMods::OP_SEL_1) ? SISrcMods::NEG_HI : SISrcMods::NEG; + if ((Src0Mods & negMask0) || (Src1Mods & negMask1)) + return; + } if (canUnpackingClobberRegister(Instr)) return; // If it's a packed instruction, adjust latency: remove the packed @@ -667,28 +682,6 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); assert(UnpackedOpcode != std::numeric_limits::max() && "Unsupported Opcode"); - // V_MOV_B32 does not support source modifiers. Without source modifiers, we - // cannot be faithful to the packed instruction semantics in few cases. This - // is true when the packed instruction has NEG and NEG_HI modifiers. We should - // abort unpacking if: - // 1. hi/lo bits selected by OPSEL for src0 are also marked by NEG or NEG_HI. - // 2. hi/lo bits selected by OPSEL_HI for src1 are also marked by NEG or - // NEG_HI. - // Packed instructions do not specify ABS modifiers, so we can safely ignore - // those. - if (!AMDGPU::hasNamedOperand(UnpackedOpcode, - AMDGPU::OpName::src0_modifiers)) { - unsigned Src0Mods = - TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm(); - unsigned Src1Mods = - TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm(); - unsigned negMask0 = - (Src0Mods & SISrcMods::OP_SEL_0) ? SISrcMods::NEG_HI : SISrcMods::NEG; - unsigned negMask1 = - (Src1Mods & SISrcMods::OP_SEL_1) ? SISrcMods::NEG_HI : SISrcMods::NEG; - if ((Src0Mods & negMask0) || (Src1Mods & negMask1)) - return; - } MachineInstrBuilder Op0LOp1L = createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false); From 75daff03e892b318550322bde3396ecb73a3fafc Mon Sep 17 00:00:00 2001 From: Akash Dutta <137309513+akadutta@users.noreply.github.com> Date: Wed, 5 Nov 2025 15:19:43 -0600 Subject: [PATCH 7/9] Apply suggestion from @arsenm Co-authored-by: Matt Arsenault --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index cb10216fbab3b..e4c9338db343c 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -525,8 +525,6 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { return AMDGPU::V_FMA_F32_e64; case AMDGPU::V_PK_MOV_B32: // Source modifiers aren't handled for MOV due to prevailing restrictions. - // Hence, 64-bit encoding isn't necessary. Will create unnecessary - // instruction cache pressure. return AMDGPU::V_MOV_B32_e32; default: return std::numeric_limits::max(); From 53009ee42b5a41262438a4e072baad5a5e05f937 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Wed, 5 Nov 2025 16:41:22 -0600 Subject: [PATCH 8/9] code optimization, clang-format --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 63 ++++++++++---------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index cb10216fbab3b..871712184723b 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -467,46 +467,48 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); // Lambda to check if a source operand causes clobbering - auto checkSrcClobber = [&](AMDGPU::OpName SrcName, AMDGPU::OpName ModsName) -> bool { + auto checkSrcClobber = [&](AMDGPU::OpName SrcName, + AMDGPU::OpName ModsName) -> bool { const MachineOperand *SrcMO = TII->getNamedOperand(MI, SrcName); if (SrcMO && SrcMO->isReg()) { Register SrcReg = SrcMO->getReg(); unsigned SrcMods = TII->getNamedOperand(MI, ModsName)->getImm(); Register HiSrcReg = (SrcMods & SISrcMods::OP_SEL_1) - ? TRI->getSubReg(SrcReg, AMDGPU::sub1) - : TRI->getSubReg(SrcReg, AMDGPU::sub0); + ? TRI->getSubReg(SrcReg, AMDGPU::sub1) + : TRI->getSubReg(SrcReg, AMDGPU::sub0); return TRI->regsOverlap(UnpackedDstReg, HiSrcReg); - } + } }; - + // Src1 should be checked before src0 to avoid false positives. // For example, the following unpacked sequence is legal: // $vgpr0_vgpr1 = V_PK_MUL_F32 8, $vgpr0_vgpr1, 8, $vgpr2_vgpr3 // => // $vgpr0 = V_MUL_F32 $vgpr0, $vgpr2 // $vgpr1 = V_MUL_F32 $vgpr1, $vgpr3 - // Although the destination and source overlap in the first instruction ($vgpr0), $vgpr0 is not used as a source in the second instruction. + // Although the destination and source overlap in the first instruction + // ($vgpr0), $vgpr0 is not used as a source in the second instruction. // Therefore, unpacking this sequence is safe. - // + // // The following sequence, however, is not safe to unpack: // $vgpr0_vgpr1 = V_PK_MUL_F32 0, $vgpr0_vgpr1, 8, $vgpr2_vgpr3 // => // $vgpr0 = V_MUL_F32 $vgpr0, $vgpr2 // $vgpr1 = V_MUL_F32 $vgpr0, $vgpr3 - // In the unpacked version, $vgpr1 uses $vgpr0 as a source, but $vgpr0 was updated in the previous instruction. - // This behavior does not occur with the packed instruction. - // As a result, it is unsafe to unpack this sequence. + // In the unpacked version, $vgpr1 uses $vgpr0 as a source, but $vgpr0 was + // updated in the previous instruction. This behavior does not occur with the + // packed instruction. As a result, it is unsafe to unpack this sequence. if (checkSrcClobber(AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers)) return true; if (checkSrcClobber(AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers)) - return true; - + return true; + // Applicable for packed instructions with 3 source operands, such as // V_PK_FMA. if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { if (checkSrcClobber(AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers)) - return true; + return true; } return false; } @@ -607,15 +609,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates( for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; - uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr); - bool IsUnpackable = - UnpackedOpCode != std::numeric_limits::max(); if (Instr.isMetaInstruction()) continue; - if ((Instr.isTerminator()) || - (TII->isNeverCoissue(Instr) && !IsUnpackable) || - (SIInstrInfo::modifiesModeRegister(Instr) && - Instr.modifiesRegister(AMDGPU::EXEC, TRI))) + if (Instr.isTerminator()) + return; + uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr); + bool IsUnpackable = UnpackedOpCode != std::numeric_limits::max(); + if (TII->isNeverCoissue(Instr) && !IsUnpackable) return; const MCSchedClassDesc *InstrSchedClassDesc = @@ -631,26 +631,27 @@ void SIPreEmitPeephole::collectUnpackingCandidates( // transitive dependencies between the MFMA def and candidate instruction // def and uses. Conservatively ensures that we do not incorrectly // read/write registers. - for (const MachineOperand &InstrMO : Instr.operands()) { - if (!InstrMO.isReg() || !InstrMO.getReg().isValid()) - continue; - if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) - return; - } + if (llvm::any_of(Instr.operands(), [&](const MachineOperand &InstrMO) { + return InstrMO.isReg() && InstrMO.getReg().isValid() && + TRI->regsOverlap(MFMADef, InstrMO.getReg()); + })) + return; + if (!IsUnpackable) continue; - + // V_MOV_B32 does not support source modifiers. Without source modifiers, we // cannot be faithful to the packed instruction semantics in few cases. This - // is true when the packed instruction has NEG and NEG_HI modifiers. We should - // abort unpacking if: - // 1. hi/lo bits selected by OPSEL for src0 are also marked by NEG or NEG_HI. + // is true when the packed instruction has NEG and NEG_HI modifiers. We + // should abort unpacking if: + // 1. hi/lo bits selected by OPSEL for src0 are also marked by NEG or + // NEG_HI. // 2. hi/lo bits selected by OPSEL_HI for src1 are also marked by NEG or // NEG_HI. // Packed instructions do not specify ABS modifiers, so we can safely ignore // those. if (!AMDGPU::hasNamedOperand(UnpackedOpCode, - AMDGPU::OpName::src0_modifiers)) { + AMDGPU::OpName::src0_modifiers)) { unsigned Src0Mods = TII->getNamedOperand(Instr, AMDGPU::OpName::src0_modifiers)->getImm(); unsigned Src1Mods = From 6b43f6c61289d19dee5ea94c9bd9739c519fffaf Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Wed, 5 Nov 2025 17:21:17 -0600 Subject: [PATCH 9/9] add removed test, remove func liveins in test --- ...ck-non-coissue-insts-post-ra-scheduler.mir | 66 ++++--------------- 1 file changed, 12 insertions(+), 54 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index efdb5fb4b9bec..8d159bb87fe35 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -7,9 +7,6 @@ name: test_pk_mul_unpacking_f32 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -100,9 +97,6 @@ body: | name: test_op_sel_selection_unpacking_f32 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5'} - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -193,9 +187,6 @@ body: | name: test_op_sel_hi_selection_unpacking_f32 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5'} - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -286,9 +277,6 @@ body: | name: test_pk_add_unpacking_f32 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -443,9 +431,6 @@ body: | name: test_pk_fma_unpacking_f32 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -528,9 +513,6 @@ body: | name: test_unpacking_does_not_introduce_rw_dependency tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -617,9 +599,6 @@ body: | name: test_opcodes_not_supported_for_unpacking_are_skipped tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -641,6 +620,9 @@ body: | ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec ; GFX950-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec ; GFX950-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec ; GFX950-NEXT: S_ENDPGM 0 ; ; GFX942-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped @@ -661,6 +643,9 @@ body: | ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec ; GFX942-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec ; GFX942-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped @@ -681,6 +666,9 @@ body: | ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec ; GFX90A-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec @@ -697,6 +685,9 @@ body: | $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -704,9 +695,6 @@ body: | name: test_opsel_register_is_correctly_marked_as_killed tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -797,9 +785,6 @@ body: | name: test_inst_dependent_on_mfma_are_not_unpacked tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -888,9 +873,6 @@ body: | name: test_mfma_def_using_instr_blocks_unpacking tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -975,9 +957,6 @@ body: | name: test_unpacking_with_imm_input tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -1068,9 +1047,6 @@ body: | name: test_neg_lo_hi_post_unpacking tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -1242,9 +1218,6 @@ body: | name: test_v_pk_mov_with_neg_modifiers_not_unpacked_0 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -1325,9 +1298,6 @@ body: | name: test_v_pk_mov_with_neg_modifiers_not_unpacked_1 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -1408,9 +1378,6 @@ body: | name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_2 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -1493,9 +1460,6 @@ body: | name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_3 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -1578,9 +1542,6 @@ body: | name: test_v_pk_mov_with_neg_modifiers_correctly_unpacked_4 tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5 @@ -1661,9 +1622,6 @@ body: | name: test_v_pk_mov_op_sel_unpacking tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - body: | bb.0.entry: liveins: $sgpr4_sgpr5