diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 8d4cad4c07bc7..0c77fe7259588 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -104,6 +104,13 @@ def foldable_fneg : GICombineRule< [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]), (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>; +// Detects s_mul_u64 instructions whose higher bits are zero/sign extended. +def smulu64 : GICombineRule< + (defs root:$smul, unsigned_matchinfo:$matchinfo), + (match (wip_match_opcode G_MUL):$smul, + [{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]), + (apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>; + def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">; def sign_extension_in_reg : GICombineRule< @@ -149,7 +156,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< "AMDGPUPostLegalizerCombinerImpl", [all_combines, gfx6gfx7_combines, gfx8_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, - rcp_sqrt_to_rsq, sign_extension_in_reg]> { + rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index dfbe5c7fed882..aa235c07e9959 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -701,13 +701,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .maxScalar(0, S32); } - getActionDefinitionsBuilder(G_MUL) - .legalFor({S32, S16, V2S16}) - .clampMaxNumElementsStrict(0, S16, 2) - .scalarize(0) - .minScalar(0, S16) - .widenScalarToNextMultipleOf(0, 32) - .custom(); + if (ST.hasScalarSMulU64()) { + getActionDefinitionsBuilder(G_MUL) + .legalFor({S64, S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .custom(); + } else { + getActionDefinitionsBuilder(G_MUL) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .custom(); + } assert(ST.hasMad64_32()); getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 7b18e1f805d8f..21bfab52c6c4b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -104,6 +104,14 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner { void applyCombineSignExtendInReg(MachineInstr &MI, MachineInstr *&MatchInfo) const; + // Find the s_mul_u64 instructions where the higher bits are either + // zero-extended or sign-extended. + bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const; + // Replace the s_mul_u64 instructions with S_MUL_I64_I32_PSEUDO if the higher + // 33 bits are sign extended and with S_MUL_U64_U32_PSEUDO if the higher 32 + // bits are zero extended. 
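To make the 33-bit/32-bit thresholds above concrete: an operand qualifies for the signed pseudo only if truncating it to 32 bits and sign-extending back is lossless, which requires at least 33 identical leading bits, while the unsigned pseudo only needs the top 32 bits to be zero. A minimal standalone check of that property follows; the helper names are invented for illustration and are not part of the patch.

#include <cassert>
#include <cstdint>

// Lossless round trip through i32 with sign extension, i.e. the value has at
// least 33 leading copies of the sign bit.
static bool SignExtendedFrom32(int64_t V) {
  return V == static_cast<int64_t>(static_cast<int32_t>(V));
}

// Lossless round trip through i32 with zero extension, i.e. the top 32 bits
// are zero.
static bool ZeroExtendedFrom32(uint64_t V) {
  return V == static_cast<uint64_t>(static_cast<uint32_t>(V));
}

int main() {
  assert(SignExtendedFrom32(-42) && SignExtendedFrom32(0x7fffffffLL));
  // 0x80000000 has only 32 identical leading bits, so 33 sign bits is the
  // right threshold: it is not sign-extended from 32 bits.
  assert(!SignExtendedFrom32(0x80000000LL));
  assert(ZeroExtendedFrom32(0xffffffffULL) && !ZeroExtendedFrom32(1ULL << 32));
  return 0;
}

When both G_MUL operands satisfy one of these checks, the full 64-bit product equals the widened 32 x 32 product, which is exactly what the pseudo instructions compute.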
+ void applyCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const; + private: #define GET_GICOMBINER_CLASS_MEMBERS #define AMDGPUSubtarget GCNSubtarget @@ -419,6 +427,32 @@ void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg( MI.eraseFromParent(); } +bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64( + MachineInstr &MI, unsigned &NewOpcode) const { + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + if (MRI.getType(Src0) != LLT::scalar(64)) + return false; + + if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 && + KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) { + NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32; + return true; + } + + if (KB->computeNumSignBits(Src1) >= 33 && + KB->computeNumSignBits(Src0) >= 33) { + NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32; + return true; + } + return false; +} + +void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64( + MachineInstr &MI, unsigned &NewOpcode) const { + Helper.replaceOpcodeWith(MI, NewOpcode); +} + // Pass boilerplate // ================ diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 92182ec069426..ecb7bb9d1d975 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2094,6 +2094,74 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( return true; } +// Break s_mul_u64 into 32-bit vector operations. +void AMDGPURegisterBankInfo::applyMappingSMULU64( + MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { + SmallVector DefRegs(OpdMapper.getVRegs(0)); + SmallVector Src0Regs(OpdMapper.getVRegs(1)); + SmallVector Src1Regs(OpdMapper.getVRegs(2)); + + // All inputs are SGPRs, nothing special to do. + if (DefRegs.empty()) { + assert(Src0Regs.empty() && Src1Regs.empty()); + applyDefaultMapping(OpdMapper); + return; + } + + assert(DefRegs.size() == 2); + assert(Src0Regs.size() == Src1Regs.size() && + (Src0Regs.empty() || Src0Regs.size() == 2)); + + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + MachineInstr &MI = OpdMapper.getMI(); + Register DstReg = MI.getOperand(0).getReg(); + LLT HalfTy = LLT::scalar(32); + + // Depending on where the source registers came from, the generic code may + // have decided to split the inputs already or not. If not, we still need to + // extract the values. + + if (Src0Regs.empty()) + split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); + else + setRegsToType(MRI, Src0Regs, HalfTy); + + if (Src1Regs.empty()) + split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); + else + setRegsToType(MRI, Src1Regs, HalfTy); + + setRegsToType(MRI, DefRegs, HalfTy); + + // The multiplication is done as follows: + // + // Op1H Op1L + // * Op0H Op0L + // -------------------- + // Op1H*Op0L Op1L*Op0L + // + Op1H*Op0H Op1L*Op0H + // ----------------------------------------- + // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L + // + // We drop Op1H*Op0H because the result of the multiplication is a 64-bit + // value and that would overflow. + // The low 32-bit value is Op1L*Op0L. + // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from + // Op1L*Op0L). 
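As a sanity check on the decomposition described above (and implemented by the buildUMulH/buildMul/buildAdd calls that follow), the same arithmetic can be written in plain C++. This is only an illustrative sketch with an invented helper name, not code from the patch:

#include <cassert>
#include <cstdint>

// 64 x 64 -> 64 multiply assembled from 32-bit pieces, mirroring the
// mul-lo/mul-hi/add expansion: the Op1H*Op0H term never contributes to the
// low 64 bits of the result.
static uint64_t Mul64From32(uint64_t Op0, uint64_t Op1) {
  uint32_t Op0L = static_cast<uint32_t>(Op0);
  uint32_t Op0H = static_cast<uint32_t>(Op0 >> 32);
  uint32_t Op1L = static_cast<uint32_t>(Op1);
  uint32_t Op1H = static_cast<uint32_t>(Op1 >> 32);
  uint32_t Lo = Op1L * Op0L;                     // Op1L*Op0L (low half)
  uint32_t Carry =                               // mul-hi of the low halves
      static_cast<uint32_t>((uint64_t(Op1L) * Op0L) >> 32);
  uint32_t Hi = Op1H * Op0L + Op1L * Op0H + Carry; // high half
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  uint64_t A = 0x123456789abcdef0ULL, B = 0xfedcba9876543210ULL;
  assert(Mul64From32(A, B) == A * B);
  return 0;
}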
+ + ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); + + Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0); + Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0); + Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0); + Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0); + B.buildAdd(DefRegs[1], Add, MulHiLo); + B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]); + + MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); + MI.eraseFromParent(); +} + void AMDGPURegisterBankInfo::applyMappingImpl( MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -2394,13 +2462,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl( Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); + // Special case for s_mul_u64. There is not a vector equivalent of + // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector + // multiplications. + if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { + applyMappingSMULU64(B, OpdMapper); + return; + } + // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. // Packed 16-bit operations need to be scalarized and promoted. if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) break; const RegisterBank *DstBank = - OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; if (DstBank == &AMDGPU::VGPRRegBank) break; @@ -2451,6 +2527,72 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } + case AMDGPU::G_AMDGPU_S_MUL_I64_I32: + case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { + // This is a special case for s_mul_u64. We use + // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation + // where the 33 higher bits are sign-extended and + // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation + // where the 32 higher bits are zero-extended. In case scalar registers are + // selected, both opcodes are lowered as s_mul_u64. If the vector registers + // are selected, then G_AMDGPU_S_MUL_I64_I32 and + // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. + + // Insert basic copies. + applyDefaultMapping(OpdMapper); + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg0 = MI.getOperand(1).getReg(); + Register SrcReg1 = MI.getOperand(2).getReg(); + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 " + "that handles only 64-bit operands."); + const RegisterBank *DstBank = + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + + // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 + // with s_mul_u64 operation. + if (DstBank == &AMDGPU::SGPRRegBank) { + MI.setDesc(TII->get(AMDGPU::S_MUL_U64)); + MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass); + MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass); + MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass); + return; + } + + // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 + // with a vector mad. + assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && + "The destination operand should be in vector registers."); + + DebugLoc DL = MI.getDebugLoc(); + + // Extract the lower subregister from the first operand. 
+ Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass); + MRI.setType(Op0L, S32); + B.buildTrunc(Op0L, SrcReg0); + + // Extract the lower subregister from the second operand. + Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass); + MRI.setType(Op1L, S32); + B.buildTrunc(Op1L, SrcReg1); + + unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 + ? AMDGPU::G_AMDGPU_MAD_U64_U32 + : AMDGPU::G_AMDGPU_MAD_I64_I32; + + MachineIRBuilder B(MI); + Register Zero64 = B.buildConstant(S64, 0).getReg(0); + MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass); + Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass); + B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64}); + MI.eraseFromParent(); + return; + } case AMDGPU::G_SEXT_INREG: { SmallVector SrcRegs(OpdMapper.getVRegs(1)); if (SrcRegs.empty()) @@ -3669,7 +3811,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AND: case AMDGPU::G_OR: - case AMDGPU::G_XOR: { + case AMDGPU::G_XOR: + case AMDGPU::G_MUL: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); if (Size == 1) { const RegisterBank *DstBank @@ -3737,7 +3880,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_PTRMASK: case AMDGPU::G_ADD: case AMDGPU::G_SUB: - case AMDGPU::G_MUL: case AMDGPU::G_SHL: case AMDGPU::G_LSHR: case AMDGPU::G_ASHR: @@ -3755,6 +3897,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SHUFFLE_VECTOR: case AMDGPU::G_SBFX: case AMDGPU::G_UBFX: + case AMDGPU::G_AMDGPU_S_MUL_I64_I32: + case AMDGPU::G_AMDGPU_S_MUL_U64_U32: if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index b5d16e70ab23a..2bb5ef57fe031 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -84,6 +84,9 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo { bool applyMappingMAD_64_32(MachineIRBuilder &B, const OperandsMapper &OpdMapper) const; + void applyMappingSMULU64(MachineIRBuilder &B, + const OperandsMapper &OpdMapper) const; + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg) const; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ce3164d7b92e5..f6f37f5170a40 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -683,6 +683,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } + bool hasScalarSMulU64() const { return getGeneration() >= GFX12; } + bool hasUnpackedD16VMem() const { return HasUnpackedD16VMem; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e865c73015d29..209debb3a1058 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -835,6 +835,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); + if (Subtarget->hasScalarSMulU64()) + setOperationAction(ISD::MUL, MVT::i64, Custom); + if (Subtarget->hasMad64_32()) 
     setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
@@ -5566,7 +5569,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SRL:
   case ISD::ADD:
   case ISD::SUB:
-  case ISD::MUL:
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
@@ -5580,6 +5582,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SADDSAT:
   case ISD::SSUBSAT:
     return splitBinaryVectorOp(Op, DAG);
+  case ISD::MUL:
+    return lowerMUL(Op, DAG);
   case ISD::SMULO:
   case ISD::UMULO:
     return lowerXMULO(Op, DAG);
@@ -6235,6 +6239,66 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
 }
 
+// Custom lowering for vector multiplications and s_mul_u64.
+SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  // Split vector operands.
+  if (VT.isVector())
+    return splitBinaryVectorOp(Op, DAG);
+
+  assert(VT == MVT::i64 && "The following code is a special case for s_mul_u64");
+
+  // There are four ways to lower s_mul_u64:
+  //
+  // 1. If all the operands are uniform, then we lower it as it is.
+  //
+  // 2. If the operands are divergent, then we have to split s_mul_u64 into
+  //    32-bit multiplications because there is not a vector equivalent of
+  //    s_mul_u64.
+  //
+  // 3. If the cost model decides that it is more efficient to use vector
+  //    registers, then we have to split s_mul_u64 into 32-bit multiplications.
+  //    This happens in splitScalarSMulU64() in SIInstrInfo.cpp.
+  //
+  // 4. If the cost model decides to use vector registers and both of the
+  //    operands are zero-extended/sign-extended from 32 bits, then we split
+  //    s_mul_u64 into two 32-bit multiplications. The problem is that it is
+  //    not possible to check whether the operands are zero-extended or
+  //    sign-extended in SIInstrInfo.cpp. For this reason, here, we replace
+  //    s_mul_u64 with s_mul_u64_u32_pseudo if both operands are zero-extended
+  //    and we replace s_mul_u64 with s_mul_i64_i32_pseudo if both operands
+  //    are sign-extended. If the cost model decides that we have to use
+  //    vector registers, then splitScalarSMulPseudo() (in SIInstrInfo.cpp)
+  //    splits s_mul_u64_u32_pseudo/s_mul_i64_i32_pseudo into two vector
+  //    multiplications. If the cost model decides that we should use scalar
+  //    registers, then s_mul_u64_u32_pseudo/s_mul_i64_i32_pseudo is lowered
+  //    as s_mul_u64 in expandPostRAPseudo() in SIInstrInfo.cpp.
+
+  if (Op->isDivergent())
+    return SDValue();
+
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  // If all the operands are zero-extended to 32 bits, then we replace
+  // s_mul_u64 with s_mul_u64_u32_pseudo. If all the operands are
+  // sign-extended to 32 bits, then we replace s_mul_u64 with
+  // s_mul_i64_i32_pseudo.
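Read as pseudocode, the cases above boil down to a small decision function. The sketch below is illustrative only (the enum and parameter names are invented); the real checks are performed right after this comment via DAG.computeKnownBits() and DAG.ComputeNumSignBits().

enum class S64MulLowering {
  KeepScalar,    // case 1: uniform, lowered as s_mul_u64
  ZeroExtPseudo, // case 4: uniform, both operands zero-extended from 32 bits
  SignExtPseudo, // case 4: uniform, both operands sign-extended from 32 bits
  ExpandTo32Bit  // case 2: divergent, split into 32-bit multiplications
};

// LeadingZerosN / SignBitsN stand in for the known-bits results on operand N.
static S64MulLowering chooseS64MulLowering(bool Divergent,
                                           unsigned LeadingZeros0,
                                           unsigned LeadingZeros1,
                                           unsigned SignBits0,
                                           unsigned SignBits1) {
  if (Divergent)
    return S64MulLowering::ExpandTo32Bit;
  if (LeadingZeros0 >= 32 && LeadingZeros1 >= 32)
    return S64MulLowering::ZeroExtPseudo; // s_mul_u64_u32_pseudo
  if (SignBits0 >= 33 && SignBits1 >= 33)
    return S64MulLowering::SignExtPseudo; // s_mul_i64_i32_pseudo
  return S64MulLowering::KeepScalar;      // plain s_mul_u64
}

Case 3 (the register-bank/cost-model driven split) is not decided here; it happens later in SIInstrInfo.cpp, which is why only divergence and the known-bits facts matter at this point.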
+ KnownBits Op0KnownBits = DAG.computeKnownBits(Op0); + unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros(); + KnownBits Op1KnownBits = DAG.computeKnownBits(Op1); + unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros(); + SDLoc SL(Op); + if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32) + return SDValue( + DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0); + unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0); + unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1); + if (Op0SignBits >= 33 && Op1SignBits >= 33) + return SDValue( + DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0); + // If all the operands are uniform, then we lower s_mul_u64 as it is. + return Op; +} + SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc SL(Op); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 00f9ddf11ea7a..92b38ebade621 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -146,6 +146,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 67992929ab356..d4c7a457e9aae 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2475,6 +2475,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + + case AMDGPU::S_MUL_U64_U32_PSEUDO: + case AMDGPU::S_MUL_I64_I32_PSEUDO: + MI.setDesc(get(AMDGPU::S_MUL_U64)); + break; } return true; } @@ -6845,6 +6850,21 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // Default handling break; } + + case AMDGPU::S_MUL_U64: + // Split s_mul_u64 in 32-bit vector multiplications. + splitScalarSMulU64(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; + + case AMDGPU::S_MUL_U64_U32_PSEUDO: + case AMDGPU::S_MUL_I64_I32_PSEUDO: + // This is a special case of s_mul_u64 where all the operands are either + // zero extended or sign extended. + splitScalarSMulPseudo(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; + case AMDGPU::S_AND_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); Inst.eraseFromParent(); @@ -7654,6 +7674,180 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } +// There is not a vector equivalent of s_mul_u64. For this reason, we need to +// split the s_mul_u64 in 32-bit vector multiplications. 
+void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
+                                     MachineInstr &Inst,
+                                     MachineDominatorTree *MDT) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+  MachineBasicBlock::iterator MII = Inst;
+
+  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+  const TargetRegisterClass *Src0SubRC =
+      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src0SubRC))
+    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
+  const TargetRegisterClass *Src1SubRC =
+      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src1SubRC))
+    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
+
+  // First, we extract the low 32-bit and high 32-bit values from each of the
+  // operands.
+  MachineOperand Op0L =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+  MachineOperand Op1L =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+  MachineOperand Op0H =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+  MachineOperand Op1H =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
+  // The multiplication is done as follows:
+  //
+  //                            Op1H  Op1L
+  //                          * Op0H  Op0L
+  //                       --------------------
+  //                       Op1H*Op0L  Op1L*Op0L
+  //          + Op1H*Op0H  Op1L*Op0H
+  // -----------------------------------------
+  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
+  //
+  // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
+  // value and that would overflow.
+  // The low 32-bit value is Op1L*Op0L.
+  // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
+
+  Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Op1L_Op0H =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
+          .add(Op1L)
+          .add(Op0H);
+
+  Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Op1H_Op0L =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
+          .add(Op1H)
+          .add(Op0L);
+
+  Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Carry =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
+          .add(Op1L)
+          .add(Op0L);
+
+  MachineInstr *LoHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
+          .add(Op1L)
+          .add(Op0L);
+
+  Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
+                          .addReg(Op1L_Op0H_Reg)
+                          .addReg(Op1H_Op0L_Reg);
+
+  MachineInstr *HiHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
+          .addReg(AddReg)
+          .addReg(CarryReg);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+ legalizeOperands(*Op1L_Op0H, MDT); + legalizeOperands(*Op1H_Op0L, MDT); + legalizeOperands(*Carry, MDT); + legalizeOperands(*LoHalf, MDT); + legalizeOperands(*Add, MDT); + legalizeOperands(*HiHalf, MDT); + + // Move all users of this moved value. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} + +// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector +// multiplications. +void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist, + MachineInstr &Inst, + MachineDominatorTree *MDT) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + const DebugLoc &DL = Inst.getDebugLoc(); + MachineBasicBlock::iterator MII = Inst; + + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); + const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); + const TargetRegisterClass *Src0SubRC = + RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); + if (RI.isSGPRClass(Src0SubRC)) + Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC); + const TargetRegisterClass *Src1SubRC = + RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); + if (RI.isSGPRClass(Src1SubRC)) + Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC); + + // First, we extract the low 32-bit and high 32-bit values from each of the + // operands. + MachineOperand Op0L = + buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); + MachineOperand Op1L = + buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + + unsigned Opc = Inst.getOpcode(); + unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO + ? AMDGPU::V_MUL_HI_U32_e64 + : AMDGPU::V_MUL_HI_I32_e64; + MachineInstr *HiHalf = + BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L); + + MachineInstr *LoHalf = + BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0) + .add(Op1L) + .add(Op0L); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + legalizeOperands(*HiHalf, MDT); + legalizeOperands(*LoHalf, MDT); + + // Move all users of this moved value. 
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} + void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, MachineDominatorTree *MDT) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 46eee6fae0a52..37ee159362a28 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -138,6 +138,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { unsigned Opcode, MachineDominatorTree *MDT = nullptr) const; + void splitScalarSMulU64(SIInstrWorklist &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT) const; + + void splitScalarSMulPseudo(SIInstrWorklist &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT) const; + void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index b0b7854ffc067..1cd8a37c3aa99 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3853,6 +3853,18 @@ def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction { let mayStore = 0; } +def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1); + let hasSideEffects = 0; +} + +def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1); + let hasSideEffects = 0; +} + // This is equivalent to the G_INTRINSIC*, but the operands may have // been legalized depending on the subtarget requirements. def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index c9687ac368d30..5f021307e18ee 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -673,6 +673,16 @@ let SubtargetPredicate = isGFX12Plus in { let isCommutable = 1; } + // The higher 32-bits of the inputs contain the sign extension bits. + def S_MUL_I64_I32_PSEUDO : SPseudoInstSI < + (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) + >; + + // The higher 32-bits of the inputs are zero. 
+ def S_MUL_U64_U32_PSEUDO : SPseudoInstSI < + (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) + >; + } // End SubtargetPredicate = isGFX12Plus let Uses = [SCC] in { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir index 5b0ed61a3313b..2bf8649e76242 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir @@ -1,9 +1,10 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX89,GFX8 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX89,GFX9PLUS %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX9PLUS,GFX1011 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX9PLUS,GFX1011 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX9PLUS,GFX12 %s --- name: test_mul_s32 @@ -11,34 +12,13 @@ body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: test_mul_s32 - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] - ; GFX6-NEXT: $vgpr0 = COPY [[MUL]](s32) - ; GFX8-LABEL: name: test_mul_s32 - ; GFX8: liveins: $vgpr0, $vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] - ; GFX8-NEXT: $vgpr0 = COPY [[MUL]](s32) - ; GFX9-LABEL: name: test_mul_s32 - ; GFX9: liveins: $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] - ; GFX9-NEXT: $vgpr0 = COPY [[MUL]](s32) - ; GFX10-LABEL: name: test_mul_s32 - ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] - ; GFX10-NEXT: $vgpr0 = COPY [[MUL]](s32) + ; GCN-LABEL: name: test_mul_s32 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL 
[[COPY]], [[COPY1]] + ; GCN-NEXT: $vgpr0 = COPY [[MUL]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_MUL %0, %1 @@ -51,50 +31,17 @@ body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6-LABEL: name: test_mul_v2s32 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 - ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX6-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX6-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] - ; GFX6-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]] - ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX8-LABEL: name: test_mul_v2s32 - ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 - ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] - ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]] - ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX9-LABEL: name: test_mul_v2s32 - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] - ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; GFX10-LABEL: name: test_mul_v2s32 - ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] - ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GCN-LABEL: name: test_mul_v2s32 + ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GCN-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; 
GCN-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] + ; GCN-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]] + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MUL]](s32), [[MUL1]](s32) + ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = G_MUL %0, %1 @@ -122,54 +69,48 @@ body: | ; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) - ; GFX8-LABEL: name: test_mul_s64 - ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] - ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32) - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]] - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]] - ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) - ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) - ; GFX9-LABEL: name: test_mul_s64 - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32) - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]] - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]] - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) - ; GFX10-LABEL: name: test_mul_s64 - ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), 
[[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] - ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV5]], [[MUL]] - ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] - ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[ADD1]](s32) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; + ; GFX89-LABEL: name: test_mul_s64 + ; GFX89: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX89-NEXT: {{ $}} + ; GFX89-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX89-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX89-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX89-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX89-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] + ; GFX89-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) + ; GFX89-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32) + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]] + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]] + ; GFX89-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) + ; GFX89-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32) + ; GFX89-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; + ; GFX1011-LABEL: name: test_mul_s64 + ; GFX1011: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1011-NEXT: {{ $}} + ; GFX1011-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX1011-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX1011-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX1011-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX1011-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] + ; GFX1011-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) + ; GFX1011-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] + ; GFX1011-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV5]], [[MUL]] + ; GFX1011-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] + ; GFX1011-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]] + ; GFX1011-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[ADD1]](s32) + ; GFX1011-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; + ; GFX12-LABEL: name: test_mul_s64 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; 
GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]] + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[MUL]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = G_MUL %0, %1 @@ -209,90 +150,76 @@ body: | ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL3]](s32), [[ADD3]](s32) ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; GFX8-LABEL: name: test_mul_v2s64 - ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) - ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) - ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]] - ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV9]](s32) - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV7]], [[ANYEXT]] - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV5]](s32), [[UV6]], [[AMDGPU_MAD_U64_U32_2]] - ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) - ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[UV10]](s32) - ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) - ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV14]], [[C]] - ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64) - ; GFX8-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV17]](s32) - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV15]], [[ANYEXT1]] - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV13]](s32), [[UV14]], [[AMDGPU_MAD_U64_U32_8]] - ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64) - ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV16]](s32), [[UV18]](s32) - ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; GFX9-LABEL: name: test_mul_v2s64 - ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX9-NEXT: 
[[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]] - ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV9]](s32) - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV7]], [[ANYEXT]] - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV5]](s32), [[UV6]], [[AMDGPU_MAD_U64_U32_2]] - ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[UV10]](s32) - ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) - ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV14]], [[C]] - ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV17]](s32) - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV15]], [[ANYEXT1]] - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV13]](s32), [[UV14]], [[AMDGPU_MAD_U64_U32_8]] - ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64) - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV16]](s32), [[UV18]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; GFX10-LABEL: name: test_mul_v2s64 - ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) - ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) - ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]] - ; GFX10-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - 
; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV4]], [[UV7]] - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV9]], [[MUL]] - ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV5]], [[UV6]] - ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[ADD1]](s32) - ; GFX10-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) - ; GFX10-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) - ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV10]](s32), [[UV12]], [[C]] - ; GFX10-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_2]](s64) - ; GFX10-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UV13]] - ; GFX10-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[UV15]], [[MUL2]] - ; GFX10-NEXT: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UV12]] - ; GFX10-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[MUL3]] - ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV14]](s32), [[ADD3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; + ; GFX89-LABEL: name: test_mul_v2s64 + ; GFX89: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX89-NEXT: {{ $}} + ; GFX89-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX89-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX89-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; GFX89-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) + ; GFX89-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) + ; GFX89-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) + ; GFX89-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]] + ; GFX89-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) + ; GFX89-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV9]](s32) + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV7]], [[ANYEXT]] + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV5]](s32), [[UV6]], [[AMDGPU_MAD_U64_U32_2]] + ; GFX89-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) + ; GFX89-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[UV10]](s32) + ; GFX89-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; GFX89-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV12]](s32), [[UV14]], [[C]] + ; GFX89-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64) + ; GFX89-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV17]](s32) + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 
[[UV12]](s32), [[UV15]], [[ANYEXT1]] + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV13]](s32), [[UV14]], [[AMDGPU_MAD_U64_U32_8]] + ; GFX89-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64) + ; GFX89-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV16]](s32), [[UV18]](s32) + ; GFX89-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) + ; GFX89-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; + ; GFX1011-LABEL: name: test_mul_v2s64 + ; GFX1011: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX1011-NEXT: {{ $}} + ; GFX1011-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX1011-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX1011-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; GFX1011-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) + ; GFX1011-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) + ; GFX1011-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64) + ; GFX1011-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV4]](s32), [[UV6]], [[C]] + ; GFX1011-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) + ; GFX1011-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV4]], [[UV7]] + ; GFX1011-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV9]], [[MUL]] + ; GFX1011-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV5]], [[UV6]] + ; GFX1011-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]] + ; GFX1011-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV8]](s32), [[ADD1]](s32) + ; GFX1011-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) + ; GFX1011-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) + ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV10]](s32), [[UV12]], [[C]] + ; GFX1011-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_2]](s64) + ; GFX1011-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UV13]] + ; GFX1011-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[UV15]], [[MUL2]] + ; GFX1011-NEXT: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UV12]] + ; GFX1011-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[MUL3]] + ; GFX1011-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV14]](s32), [[ADD3]](s32) + ; GFX1011-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) + ; GFX1011-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; + ; GFX12-LABEL: name: test_mul_v2s64 + ; GFX12: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) + ; GFX12-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[UV]], [[UV2]] + ; GFX12-NEXT: 
[[MUL1:%[0-9]+]]:_(s64) = G_MUL [[UV1]], [[UV3]] + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL1]](s64) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 %2:_(<2 x s64>) = G_MUL %0, %1 @@ -314,36 +241,17 @@ body: | ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C]] ; GFX6-NEXT: $vgpr0 = COPY [[AND]](s32) - ; GFX8-LABEL: name: test_mul_s16 - ; GFX8: liveins: $vgpr0, $vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]] - ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[MUL]](s16) - ; GFX8-NEXT: $vgpr0 = COPY [[ZEXT]](s32) - ; GFX9-LABEL: name: test_mul_s16 - ; GFX9: liveins: $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]] - ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[MUL]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ZEXT]](s32) - ; GFX10-LABEL: name: test_mul_s16 - ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]] - ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[MUL]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[ZEXT]](s32) + ; + ; GFX8PLUS-LABEL: name: test_mul_s16 + ; GFX8PLUS: liveins: $vgpr0, $vgpr1 + ; GFX8PLUS-NEXT: {{ $}} + ; GFX8PLUS-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX8PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX8PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX8PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX8PLUS-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]] + ; GFX8PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[MUL]](s16) + ; GFX8PLUS-NEXT: $vgpr0 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -378,6 +286,7 @@ body: | ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX6-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX6-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; ; GFX8-LABEL: name: test_mul_v2s16 ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8-NEXT: {{ $}} @@ -400,20 +309,14 @@ body: | ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) - ; GFX9-LABEL: name: test_mul_v2s16 - ; GFX9: liveins: $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[COPY]], [[COPY1]] - ; GFX9-NEXT: $vgpr0 = COPY [[MUL]](<2 x s16>) - ; GFX10-LABEL: name: test_mul_v2s16 - ; GFX10: 
liveins: $vgpr0, $vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[COPY]], [[COPY1]] - ; GFX10-NEXT: $vgpr0 = COPY [[MUL]](<2 x s16>) + ; + ; GFX9PLUS-LABEL: name: test_mul_v2s16 + ; GFX9PLUS: liveins: $vgpr0, $vgpr1 + ; GFX9PLUS-NEXT: {{ $}} + ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9PLUS-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[COPY]], [[COPY1]] + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[MUL]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_MUL %0, %1 @@ -441,6 +344,7 @@ body: | ; GFX6-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[COPY2]], [[COPY5]] ; GFX6-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[MUL2]](s32) ; GFX6-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16), implicit [[TRUNC2]](s16) + ; ; GFX8-LABEL: name: test_mul_v3s16 ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX8-NEXT: {{ $}} @@ -460,66 +364,37 @@ body: | ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s16) = G_MUL [[TRUNC1]], [[TRUNC4]] ; GFX8-NEXT: [[MUL2:%[0-9]+]]:_(s16) = G_MUL [[TRUNC2]], [[TRUNC5]] ; GFX8-NEXT: S_ENDPGM 0, implicit [[MUL]](s16), implicit [[MUL1]](s16), implicit [[MUL2]](s16) - ; GFX9-LABEL: name: test_mul_v3s16 - ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]] - ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[MUL]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[MUL1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16) - ; GFX10-LABEL: name: test_mul_v3s16 - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, 
$vgpr5 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]] - ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[MUL]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[MUL1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16) + ; + ; GFX9PLUS-LABEL: name: test_mul_v3s16 + ; GFX9PLUS: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9PLUS-NEXT: {{ $}} + ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9PLUS-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9PLUS-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32) + ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32) + ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9PLUS-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9PLUS-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR1]], 
[[BUILD_VECTOR3]] + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[MUL]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9PLUS-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[MUL1]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9PLUS-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -578,6 +453,7 @@ body: | ; GFX6-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; ; GFX8-LABEL: name: test_mul_v4s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} @@ -618,28 +494,18 @@ body: | ; GFX8-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX8-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) - ; GFX9-LABEL: name: test_mul_v4s16 - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV]], [[UV2]] - ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV1]], [[UV3]] - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[MUL]](<2 x s16>), [[MUL1]](<2 x s16>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) - ; GFX10-LABEL: name: test_mul_v4s16 - ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) - ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV]], [[UV2]] - ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV1]], [[UV3]] - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[MUL]](<2 x s16>), [[MUL1]](<2 x s16>) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; + ; GFX9PLUS-LABEL: name: test_mul_v4s16 + ; GFX9PLUS: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX9PLUS-NEXT: {{ $}} + ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; GFX9PLUS-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[UV]], [[UV2]] + ; GFX9PLUS-NEXT: [[MUL1:%[0-9]+]]:_(<2 x 
s16>) = G_MUL [[UV1]], [[UV3]] + ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[MUL]](<2 x s16>), [[MUL1]](<2 x s16>) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<4 x s16>) = G_MUL %0, %1 @@ -652,34 +518,13 @@ body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: test_mul_s24 - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] - ; GFX6-NEXT: $vgpr0 = COPY [[MUL]](s32) - ; GFX8-LABEL: name: test_mul_s24 - ; GFX8: liveins: $vgpr0, $vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] - ; GFX8-NEXT: $vgpr0 = COPY [[MUL]](s32) - ; GFX9-LABEL: name: test_mul_s24 - ; GFX9: liveins: $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] - ; GFX9-NEXT: $vgpr0 = COPY [[MUL]](s32) - ; GFX10-LABEL: name: test_mul_s24 - ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] - ; GFX10-NEXT: $vgpr0 = COPY [[MUL]](s32) + ; GCN-LABEL: name: test_mul_s24 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] + ; GCN-NEXT: $vgpr0 = COPY [[MUL]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s24) = G_TRUNC %0 @@ -709,54 +554,48 @@ body: | ; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) - ; GFX8-LABEL: name: test_mul_s33 - ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] - ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32) - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]] - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]] - ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) - ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) - ; 
GFX9-LABEL: name: test_mul_s33 - ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32) - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]] - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV2]], [[AMDGPU_MAD_U64_U32_2]] - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) - ; GFX10-LABEL: name: test_mul_s33 - ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] - ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV5]], [[MUL]] - ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] - ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[ADD1]](s32) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; + ; GFX89-LABEL: name: test_mul_s33 + ; GFX89: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX89-NEXT: {{ $}} + ; GFX89-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX89-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX89-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX89-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX89-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] + ; GFX89-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) + ; GFX89-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV5]](s32) + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[ANYEXT]] + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), 
[[UV2]], [[AMDGPU_MAD_U64_U32_2]] + ; GFX89-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) + ; GFX89-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[UV6]](s32) + ; GFX89-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; + ; GFX1011-LABEL: name: test_mul_s33 + ; GFX1011: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX1011-NEXT: {{ $}} + ; GFX1011-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX1011-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX1011-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX1011-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX1011-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV2]], [[C]] + ; GFX1011-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) + ; GFX1011-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] + ; GFX1011-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV5]], [[MUL]] + ; GFX1011-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] + ; GFX1011-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL1]] + ; GFX1011-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[ADD1]](s32) + ; GFX1011-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; + ; GFX12-LABEL: name: test_mul_s33 + ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX12-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]] + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[MUL]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s33) = G_TRUNC %0 @@ -800,67 +639,71 @@ body: | ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ADD]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[ADD5]](s32) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) - ; GFX8-LABEL: name: test_mul_s96 - ; GFX8: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 - ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96) - ; GFX8-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96) - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]] - ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV5]], [[C]] - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV4]], [[AMDGPU_MAD_U64_U32_2]] - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV2]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_4]] - ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64) - ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), 
[[UV8]](s32) - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]] - ; GFX8-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_8]] - ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64) - ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV10]](s32), [[UV11]](s32) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96) - ; GFX9-LABEL: name: test_mul_s96 - ; GFX9: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96) - ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]] - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV5]], [[C]] - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV4]], [[AMDGPU_MAD_U64_U32_2]] - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV2]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_4]] - ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[UV8]](s32) - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]] - ; GFX9-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_8]] - ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64) - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV10]](s32), [[UV11]](s32) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96) - ; GFX10-LABEL: name: test_mul_s96 - ; GFX10: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96) - ; GFX10-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]] - ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) - ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]] - ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL 
[[UV1]], [[UV4]] - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[MUL1]] - ; GFX10-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV3]] - ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL2]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[ADD1]](s32) - ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]] - ; GFX10-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_2]] - ; GFX10-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) - ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV8]](s32), [[UV9]](s32) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96) + ; + ; GFX89-LABEL: name: test_mul_s96 + ; GFX89: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX89-NEXT: {{ $}} + ; GFX89-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX89-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX89-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96) + ; GFX89-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96) + ; GFX89-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]] + ; GFX89-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV5]], [[C]] + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV4]], [[AMDGPU_MAD_U64_U32_2]] + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_6:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_7:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV2]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_4]] + ; GFX89-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_6]](s64) + ; GFX89-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[UV8]](s32) + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_8:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_9:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]] + ; GFX89-NEXT: [[AMDGPU_MAD_U64_U32_10:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_11:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_8]] + ; GFX89-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_10]](s64) + ; GFX89-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV10]](s32), [[UV11]](s32) + ; GFX89-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96) + ; + ; GFX1011-LABEL: name: test_mul_s96 + ; GFX1011: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX1011-NEXT: {{ $}} + ; GFX1011-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX1011-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX1011-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96) + ; GFX1011-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96) + ; GFX1011-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX1011-NEXT: 
[[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]] + ; GFX1011-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) + ; GFX1011-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]] + ; GFX1011-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV4]] + ; GFX1011-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[MUL1]] + ; GFX1011-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV3]] + ; GFX1011-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL2]] + ; GFX1011-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[ADD1]](s32) + ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]] + ; GFX1011-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_2]] + ; GFX1011-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) + ; GFX1011-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV8]](s32), [[UV9]](s32) + ; GFX1011-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96) + ; + ; GFX12-LABEL: name: test_mul_s96 + ; GFX12: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96) + ; GFX12-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX12-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV3]], [[C]] + ; GFX12-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_]](s64) + ; GFX12-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]] + ; GFX12-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV4]] + ; GFX12-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[MUL1]] + ; GFX12-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV3]] + ; GFX12-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[MUL2]] + ; GFX12-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV7]](s32), [[ADD1]](s32) + ; GFX12-NEXT: [[AMDGPU_MAD_U64_U32_2:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_3:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV]](s32), [[UV4]], [[MV]] + ; GFX12-NEXT: [[AMDGPU_MAD_U64_U32_4:%[0-9]+]]:_(s64), [[AMDGPU_MAD_U64_U32_5:%[0-9]+]]:_(s1) = G_AMDGPU_MAD_U64_U32 [[UV1]](s32), [[UV3]], [[AMDGPU_MAD_U64_U32_2]] + ; GFX12-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_MAD_U64_U32_4]](s64) + ; GFX12-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[UV6]](s32), [[UV8]](s32), [[UV9]](s32) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[MV1]](s96) %0:_(s96) = COPY $vgpr0_vgpr1_vgpr2 %1:_(s96) = COPY $vgpr3_vgpr4_vgpr5 %2:_(s96) = G_MUL %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index eb3f74be71de0..0840f58ecd1a6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -4,6 +4,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -march=amdgcn 
-mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; GFX7-LABEL: s_mul_i16: @@ -31,6 +32,14 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_mul_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: s_and_b32 s1, s1, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s0, s0, s1 +; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -61,6 +70,12 @@ define i16 @v_mul_i16(i16 %num, i16 %den) { ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_mul_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -95,6 +110,15 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10PLUS-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_mul_i16_zeroext: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: s_and_b32 s1, s1, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s0, s0, s1 +; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -125,6 +149,14 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_mul_i16_zeroext: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -159,6 +191,15 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_mul_i16_signext: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: s_and_b32 s1, s1, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s0, s0, s1 +; GFX12-NEXT: s_sext_i32_i16 s0, s0 +; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -193,6 +234,14 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) { ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_mul_i16_signext: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: 
v_mul_lo_u16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -207,6 +256,11 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) { ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_mul_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_i32 s0, s0, s1 +; GFX12-NEXT: ; return to shader part epilog %result = mul i32 %num, %den ret i32 %result } @@ -223,6 +277,12 @@ define i32 @v_mul_i32(i32 %num, i32 %den) { ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_mul_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i32 %num, %den ret i32 %result } @@ -239,6 +299,12 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_mul_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_i32 s0, s0, s2 +; GFX12-NEXT: s_mul_i32 s1, s1, s3 +; GFX12-NEXT: ; return to shader part epilog %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -257,6 +323,13 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) { ; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX10PLUS-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_mul_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -308,6 +381,11 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) { ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 ; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_mul_i33: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: ; return to shader part epilog %result = mul i33 %num, %den ret i33 %result } @@ -359,6 +437,11 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) { ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 ; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_mul_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: ; return to shader part epilog %result = mul i64 %num, %den ret i64 %result } @@ -394,6 +477,17 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GFX11-NEXT: v_mul_lo_u32 v2, v5, v2 ; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_mul_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add3_u32 v1, v4, v3, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i64 %num, %den ret i64 %result } @@ -490,6 +584,26 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) { ; GFX10PLUS-NEXT: s_addc_u32 s2, s3, s0 ; GFX10PLUS-NEXT: s_mov_b32 s0, s5 ; GFX10PLUS-NEXT: ; return 
to shader part epilog +; +; GFX12-LABEL: s_mul_i96: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_i32 s6, s0, s5 +; GFX12-NEXT: s_mul_i32 s7, s1, s4 +; GFX12-NEXT: s_mul_i32 s2, s2, s3 +; GFX12-NEXT: s_add_co_i32 s6, s6, s7 +; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 +; GFX12-NEXT: s_add_co_i32 s6, s6, s2 +; GFX12-NEXT: s_mul_i32 s2, s0, s4 +; GFX12-NEXT: s_mul_i32 s5, s0, s3 +; GFX12-NEXT: s_mul_hi_u32 s0, s0, s4 +; GFX12-NEXT: s_add_co_u32 s2, s2, s7 +; GFX12-NEXT: s_mul_i32 s4, s1, s3 +; GFX12-NEXT: s_add_co_ci_u32 s0, s0, s6 +; GFX12-NEXT: s_mul_hi_u32 s3, s1, s3 +; GFX12-NEXT: s_add_co_u32 s1, s4, s2 +; GFX12-NEXT: s_add_co_ci_u32 s2, s3, s0 +; GFX12-NEXT: s_mov_b32 s0, s5 +; GFX12-NEXT: ; return to shader part epilog %result = mul i96 %num, %den %cast = bitcast i96 %result to <3 x i32> ret <3 x i32> %cast @@ -536,6 +650,22 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_mul_i96: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX12-NEXT: v_mul_lo_u32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mul_lo_u32 v5, v6, v5 +; GFX12-NEXT: v_mul_lo_u32 v8, v7, v4 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v2, v5, v8, v2 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i96 %num, %den ret i96 %result } @@ -709,6 +839,42 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) { ; GFX10PLUS-NEXT: s_mov_b32 s1, s8 ; GFX10PLUS-NEXT: s_mov_b32 s2, s7 ; GFX10PLUS-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_mul_i128: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_i32 s9, s0, s6 +; GFX12-NEXT: s_mul_i32 s11, s1, s5 +; GFX12-NEXT: s_mul_hi_u32 s10, s0, s6 +; GFX12-NEXT: s_mul_hi_u32 s12, s1, s5 +; GFX12-NEXT: s_add_co_u32 s9, s11, s9 +; GFX12-NEXT: s_mul_i32 s11, s2, s4 +; GFX12-NEXT: s_add_co_ci_u32 s10, s12, s10 +; GFX12-NEXT: s_mul_hi_u32 s12, s2, s4 +; GFX12-NEXT: s_mul_hi_u32 s8, s0, s4 +; GFX12-NEXT: s_add_co_u32 s9, s11, s9 +; GFX12-NEXT: s_mul_i32 s11, s0, s5 +; GFX12-NEXT: s_add_co_ci_u32 s10, s12, s10 +; GFX12-NEXT: s_mul_hi_u32 s12, s0, s5 +; GFX12-NEXT: s_add_co_u32 s8, s11, s8 +; GFX12-NEXT: s_add_co_ci_u32 s9, s12, s9 +; GFX12-NEXT: s_mul_i32 s12, s1, s4 +; GFX12-NEXT: s_mul_hi_u32 s13, s1, s4 +; GFX12-NEXT: s_cselect_b32 s11, 1, 0 +; GFX12-NEXT: s_add_co_u32 s8, s12, s8 +; GFX12-NEXT: s_mul_i32 s12, s0, s7 +; GFX12-NEXT: s_add_co_ci_u32 s7, s13, s9 +; GFX12-NEXT: s_add_co_ci_u32 s9, s10, s12 +; GFX12-NEXT: s_mul_i32 s1, s1, s6 +; GFX12-NEXT: s_cmp_lg_u32 s11, 0 +; GFX12-NEXT: s_mul_i32 s2, s2, s5 +; GFX12-NEXT: s_add_co_ci_u32 s1, s9, s1 +; GFX12-NEXT: s_mul_i32 s3, s3, s4 +; GFX12-NEXT: s_add_co_i32 s1, s1, s2 +; GFX12-NEXT: s_mul_i32 s0, s0, s4 +; GFX12-NEXT: s_add_co_i32 s3, s1, s3 +; GFX12-NEXT: s_mov_b32 s1, s8 +; GFX12-NEXT: s_mov_b32 s2, s7 +; GFX12-NEXT: ; return to shader part epilog %result = mul i128 %num, %den %cast = bitcast i128 %result to <4 x i32> ret <4 x i32> %cast @@ -820,6 +986,32 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; 
GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo ; GFX11-NEXT: v_add3_u32 v3, v4, v5, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_mul_i128: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 +; GFX12-NEXT: v_mov_b32_e32 v10, v2 +; GFX12-NEXT: v_mul_lo_u32 v3, v3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 +; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mov_b32_e32 v2, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mul_lo_u32 v5, v10, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v3, v4, v5, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i128 %num, %den ret i128 %result } @@ -1625,6 +1817,185 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7 ; GFX10PLUS-NEXT: s_mov_b32 s1, s16 ; GFX10PLUS-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: s_mul_i256: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_i32 s17, s0, s10 +; GFX12-NEXT: s_mul_i32 s19, s1, s9 +; GFX12-NEXT: s_mul_hi_u32 s18, s0, s10 +; GFX12-NEXT: s_mul_hi_u32 s20, s1, s9 +; GFX12-NEXT: s_add_co_u32 s17, s19, s17 +; GFX12-NEXT: s_add_co_ci_u32 s18, s20, s18 +; GFX12-NEXT: s_mul_i32 s20, s2, s8 +; GFX12-NEXT: s_mul_hi_u32 s21, s2, s8 +; GFX12-NEXT: s_cselect_b32 s19, 1, 0 +; GFX12-NEXT: s_add_co_u32 s17, s20, s17 +; GFX12-NEXT: s_mul_hi_u32 s16, s0, s8 +; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18 +; GFX12-NEXT: s_mul_i32 s21, s0, s9 +; GFX12-NEXT: s_mul_hi_u32 s22, s0, s9 +; GFX12-NEXT: s_cselect_b32 s20, 1, 0 +; GFX12-NEXT: s_add_co_u32 s16, s21, s16 +; GFX12-NEXT: s_add_co_ci_u32 s17, s22, s17 +; GFX12-NEXT: s_mul_i32 s22, s1, s8 +; GFX12-NEXT: s_mul_hi_u32 s23, s1, s8 +; GFX12-NEXT: s_cselect_b32 s21, 1, 0 +; GFX12-NEXT: s_add_co_u32 s16, s22, s16 +; GFX12-NEXT: s_add_co_ci_u32 s17, s23, s17 +; GFX12-NEXT: s_mul_i32 s23, s0, s12 +; GFX12-NEXT: s_mul_i32 s25, s1, s11 +; GFX12-NEXT: s_mul_hi_u32 s24, s0, s12 +; GFX12-NEXT: s_mul_hi_u32 s26, s1, s11 +; GFX12-NEXT: s_cselect_b32 s22, 1, 0 +; GFX12-NEXT: s_add_co_u32 s23, s25, s23 +; GFX12-NEXT: s_add_co_ci_u32 s24, s26, s24 +; GFX12-NEXT: s_mul_i32 s26, s2, s10 +; GFX12-NEXT: s_mul_hi_u32 s27, s2, s10 +; GFX12-NEXT: s_cselect_b32 s25, 1, 0 +; GFX12-NEXT: s_add_co_u32 s23, s26, s23 +; GFX12-NEXT: s_add_co_ci_u32 s24, s27, s24 +; GFX12-NEXT: s_mul_i32 s27, s3, s9 +; GFX12-NEXT: s_mul_hi_u32 s28, s3, s9 +; GFX12-NEXT: s_cselect_b32 s26, 1, 0 +; GFX12-NEXT: s_add_co_u32 s23, s27, s23 +; GFX12-NEXT: s_add_co_ci_u32 s24, s28, s24 +; GFX12-NEXT: s_mul_i32 s28, s4, s8 +; GFX12-NEXT: s_mul_hi_u32 s29, s4, s8 +; GFX12-NEXT: s_cselect_b32 s27, 1, 0 +; GFX12-NEXT: s_add_co_u32 s23, s28, s23 +; 
GFX12-NEXT: s_add_co_ci_u32 s24, s29, s24 +; GFX12-NEXT: s_mul_i32 s29, s0, s11 +; GFX12-NEXT: s_mul_hi_u32 s30, s0, s11 +; GFX12-NEXT: s_cselect_b32 s28, 1, 0 +; GFX12-NEXT: s_add_co_u32 s18, s29, s18 +; GFX12-NEXT: s_add_co_ci_u32 s23, s30, s23 +; GFX12-NEXT: s_mul_i32 s30, s1, s10 +; GFX12-NEXT: s_mul_hi_u32 s31, s1, s10 +; GFX12-NEXT: s_cselect_b32 s29, 1, 0 +; GFX12-NEXT: s_add_co_u32 s18, s30, s18 +; GFX12-NEXT: s_add_co_ci_u32 s23, s31, s23 +; GFX12-NEXT: s_mul_i32 s31, s2, s9 +; GFX12-NEXT: s_mul_hi_u32 s33, s2, s9 +; GFX12-NEXT: s_cselect_b32 s30, 1, 0 +; GFX12-NEXT: s_add_co_u32 s18, s31, s18 +; GFX12-NEXT: s_add_co_ci_u32 s23, s33, s23 +; GFX12-NEXT: s_mul_i32 s33, s3, s8 +; GFX12-NEXT: s_mul_hi_u32 s34, s3, s8 +; GFX12-NEXT: s_cselect_b32 s31, 1, 0 +; GFX12-NEXT: s_add_co_u32 s18, s33, s18 +; GFX12-NEXT: s_add_co_ci_u32 s23, s34, s23 +; GFX12-NEXT: s_cselect_b32 s33, 1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s22, 0 +; GFX12-NEXT: s_mul_hi_u32 s22, s0, s14 +; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18 +; GFX12-NEXT: s_cselect_b32 s21, 1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s20, 0 +; GFX12-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX12-NEXT: s_add_co_ci_u32 s19, s19, 0 +; GFX12-NEXT: s_cmp_lg_u32 s21, 0 +; GFX12-NEXT: s_mul_i32 s21, s0, s14 +; GFX12-NEXT: s_add_co_ci_u32 s19, s19, s23 +; GFX12-NEXT: s_mul_i32 s23, s1, s13 +; GFX12-NEXT: s_cselect_b32 s20, 1, 0 +; GFX12-NEXT: s_add_co_u32 s21, s23, s21 +; GFX12-NEXT: s_mul_i32 s23, s2, s12 +; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX12-NEXT: s_mul_hi_u32 s34, s2, s12 +; GFX12-NEXT: s_add_co_u32 s21, s23, s21 +; GFX12-NEXT: s_mul_i32 s23, s3, s11 +; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX12-NEXT: s_mul_hi_u32 s34, s3, s11 +; GFX12-NEXT: s_add_co_u32 s21, s23, s21 +; GFX12-NEXT: s_mul_i32 s23, s4, s10 +; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX12-NEXT: s_mul_hi_u32 s34, s4, s10 +; GFX12-NEXT: s_add_co_u32 s21, s23, s21 +; GFX12-NEXT: s_mul_i32 s23, s5, s9 +; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX12-NEXT: s_mul_hi_u32 s34, s5, s9 +; GFX12-NEXT: s_add_co_u32 s21, s23, s21 +; GFX12-NEXT: s_mul_i32 s23, s6, s8 +; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX12-NEXT: s_mul_hi_u32 s34, s6, s8 +; GFX12-NEXT: s_add_co_u32 s21, s23, s21 +; GFX12-NEXT: s_mul_i32 s23, s0, s13 +; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX12-NEXT: s_mul_hi_u32 s34, s0, s13 +; GFX12-NEXT: s_add_co_u32 s23, s23, s24 +; GFX12-NEXT: s_add_co_ci_u32 s21, s34, s21 +; GFX12-NEXT: s_mul_i32 s34, s1, s12 +; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX12-NEXT: s_cselect_b32 s24, 1, 0 +; GFX12-NEXT: s_add_co_u32 s23, s34, s23 +; GFX12-NEXT: s_add_co_ci_u32 s21, s35, s21 +; GFX12-NEXT: s_mul_i32 s35, s2, s11 +; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX12-NEXT: s_cselect_b32 s34, 1, 0 +; GFX12-NEXT: s_add_co_u32 s23, s35, s23 +; GFX12-NEXT: s_add_co_ci_u32 s21, s36, s21 +; GFX12-NEXT: s_mul_i32 s36, s3, s10 +; GFX12-NEXT: s_mul_hi_u32 s37, s3, s10 +; GFX12-NEXT: s_cselect_b32 s35, 1, 0 +; GFX12-NEXT: s_add_co_u32 s23, s36, s23 +; GFX12-NEXT: s_add_co_ci_u32 s21, s37, s21 +; GFX12-NEXT: s_mul_i32 s37, s4, s9 +; GFX12-NEXT: s_mul_hi_u32 s38, s4, s9 +; GFX12-NEXT: s_cselect_b32 s36, 1, 0 +; GFX12-NEXT: s_add_co_u32 s23, s37, s23 +; GFX12-NEXT: s_add_co_ci_u32 s21, s38, s21 +; GFX12-NEXT: s_mul_i32 s38, s5, s8 +; GFX12-NEXT: s_mul_hi_u32 s39, s5, s8 +; GFX12-NEXT: s_cselect_b32 s37, 1, 0 +; GFX12-NEXT: s_add_co_u32 s23, s38, s23 +; GFX12-NEXT: s_add_co_ci_u32 s21, s39, s21 +; GFX12-NEXT: s_cselect_b32 s38, 1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s30, 0 +; 
GFX12-NEXT: s_mul_i32 s1, s1, s14 +; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 +; GFX12-NEXT: s_cmp_lg_u32 s31, 0 +; GFX12-NEXT: s_mul_i32 s2, s2, s13 +; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 +; GFX12-NEXT: s_cmp_lg_u32 s33, 0 +; GFX12-NEXT: s_mul_i32 s3, s3, s12 +; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 +; GFX12-NEXT: s_cmp_lg_u32 s20, 0 +; GFX12-NEXT: s_mul_i32 s4, s4, s11 +; GFX12-NEXT: s_add_co_ci_u32 s20, s29, s23 +; GFX12-NEXT: s_cselect_b32 s23, 1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s26, 0 +; GFX12-NEXT: s_mul_i32 s26, s0, s15 +; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX12-NEXT: s_cmp_lg_u32 s27, 0 +; GFX12-NEXT: s_mul_i32 s5, s5, s10 +; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX12-NEXT: s_cmp_lg_u32 s28, 0 +; GFX12-NEXT: s_mul_i32 s6, s6, s9 +; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX12-NEXT: s_cmp_lg_u32 s23, 0 +; GFX12-NEXT: s_mul_i32 s7, s7, s8 +; GFX12-NEXT: s_add_co_ci_u32 s15, s25, s21 +; GFX12-NEXT: s_add_co_ci_u32 s21, s22, s26 +; GFX12-NEXT: s_cmp_lg_u32 s38, 0 +; GFX12-NEXT: s_mul_i32 s0, s0, s8 +; GFX12-NEXT: s_add_co_ci_u32 s1, s21, s1 +; GFX12-NEXT: s_cmp_lg_u32 s37, 0 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s2 +; GFX12-NEXT: s_cmp_lg_u32 s36, 0 +; GFX12-NEXT: s_mov_b32 s2, s17 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GFX12-NEXT: s_cmp_lg_u32 s35, 0 +; GFX12-NEXT: s_mov_b32 s3, s18 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s4 +; GFX12-NEXT: s_cmp_lg_u32 s34, 0 +; GFX12-NEXT: s_mov_b32 s4, s19 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s5 +; GFX12-NEXT: s_cmp_lg_u32 s24, 0 +; GFX12-NEXT: s_mov_b32 s5, s20 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s6 +; GFX12-NEXT: s_mov_b32 s6, s15 +; GFX12-NEXT: s_add_co_i32 s7, s1, s7 +; GFX12-NEXT: s_mov_b32 s1, s16 +; GFX12-NEXT: ; return to shader part epilog %result = mul i256 %num, %den %cast = bitcast i256 %result to <8 x i32> ret <8 x i32> %cast @@ -1978,6 +2349,454 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0 ; GFX11-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_mul_i256: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX12-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 +; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] +; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo 
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] +; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v20, v22 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v19, v22 +; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX12-NEXT: v_mov_b32_e32 v20, v18 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] +; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 +; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 +; GFX12-NEXT: v_mov_b32_e32 v13, v1 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v14, v21 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v7, v8, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den ret i256 %result } + +define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GFX7-LABEL: s_mul_u64_zext_with_vregs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0 +; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: s_mul_u64_zext_with_vregs: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v2, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_mul_u64_zext_with_vregs: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_mul_u64_zext_with_vregs: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_mul_u64_zext_with_vregs: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_mul_u64_zext_with_vregs: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %val = load i32, ptr addrspace(1) %in, align 4 + %ext = zext i32 %val to i64 + %mul = mul i64 %ext, 80 + store i64 %mul, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GFX7-LABEL: s_mul_u64_zext_with_sregs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX7-NEXT: s_mul_i32 s4, s3, 0x50 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_readfirstlane_b32 s5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: s_mul_u64_zext_with_sregs: 
+; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX8-NEXT: s_mulk_i32 s2, 0x50 +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_mul_u64_zext_with_sregs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_i32 s2, s3, 0x50 +; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_mul_u64_zext_with_sregs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mul_i32 s2, s3, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_mul_u64_zext_with_sregs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mul_i32 s2, s3, 0x50 +; GFX11-NEXT: s_mul_hi_u32 s3, s3, 0x50 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_mul_u64_zext_with_sregs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %val = load i32, ptr addrspace(1) %in, align 4 + %ext = zext i32 %val to i64 + %mul = mul i64 %ext, 80 + store i64 %mul, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GFX7-LABEL: s_mul_u64_sext_with_vregs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4] +; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: s_mul_u64_sext_with_vregs: +; GFX8: ; %bb.0: +; 
GFX8-NEXT: flat_load_dword v4, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_mul_u64_sext_with_vregs: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_mul_u64_sext_with_vregs: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0 +; GFX10-NEXT: v_mul_lo_u32 v4, 0x50, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_mul_u64_sext_with_vregs: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX11-NEXT: v_mul_lo_u32 v4, 0x50, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_mul_u64_sext_with_vregs: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %val = load i32, ptr addrspace(1) %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 80 + store i64 %mul, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GFX7-LABEL: s_mul_u64_sext_with_sregs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX7-NEXT: s_ashr_i32 s5, s3, 31 +; GFX7-NEXT: s_mul_i32 s4, s3, 0x50 +; GFX7-NEXT: s_mulk_i32 s5, 0x50 +; GFX7-NEXT: v_readfirstlane_b32 s3, v0 +; GFX7-NEXT: s_add_u32 s5, s5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: s_mul_u64_sext_with_sregs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX8-NEXT: s_ashr_i32 s3, s2, 31 +; GFX8-NEXT: s_mulk_i32 s2, 0x50 +; GFX8-NEXT: s_mulk_i32 s3, 0x50 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_add_u32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_mul_u64_sext_with_sregs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_mul_i32 s2, s3, 0x50 +; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50 +; GFX9-NEXT: s_mulk_i32 s4, 0x50 +; GFX9-NEXT: s_add_u32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_mul_u64_sext_with_sregs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50 +; GFX10-NEXT: s_mulk_i32 s3, 0x50 +; GFX10-NEXT: s_mulk_i32 s2, 0x50 +; GFX10-NEXT: s_add_i32 s3, s4, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_mul_u64_sext_with_sregs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_ashr_i32 s3, s2, 31 +; GFX11-NEXT: s_mul_hi_u32 s4, s2, 0x50 +; GFX11-NEXT: s_mulk_i32 s3, 0x50 +; GFX11-NEXT: s_mulk_i32 s2, 0x50 +; GFX11-NEXT: s_add_i32 s3, s4, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_mul_u64_sext_with_sregs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s3, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %val = load i32, ptr addrspace(1) %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 80 + store i64 %mul, ptr addrspace(1) %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-mul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-mul.mir new file mode 100644 index 0000000000000..f74a575ac931e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-mul.mir @@ -0,0 +1,60 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: mul_s64 +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: mul_s64 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 12345 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[C]] + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MUL]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = G_CONSTANT i64 12345 + %2:_(s64) = G_MUL %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: mul_s64_zext +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: mul_s64_zext + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12345 + ; CHECK-NEXT: [[AMDGPU_:%[0-9]+]]:_(s64) = G_AMDGPU_S_MUL_U64_U32 [[ZEXT]], [[C]] + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[AMDGPU_]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_ZEXT %0 + %2:_(s64) = G_CONSTANT i64 12345 + %3:_(s64) = G_MUL %1, %2 + $vgpr0_vgpr1 = COPY %3 +... + +--- +name: mul_s64_sext +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: mul_s64_sext + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12345 + ; CHECK-NEXT: [[AMDGPU_:%[0-9]+]]:_(s64) = G_AMDGPU_S_MUL_I64_I32 [[SEXT]], [[C]] + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[AMDGPU_]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_SEXT %0 + %2:_(s64) = G_CONSTANT i64 12345 + %3:_(s64) = G_MUL %1, %2 + $vgpr0_vgpr1 = COPY %3 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir index a5b61641e0c26..a6cc6c92d9f8a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mul.mir @@ -74,3 +74,125 @@ body: | %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_MUL %0, %1 ... + +--- +name: mul_s64_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-LABEL: name: mul_s64_ss + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:sgpr(s64) = G_MUL [[COPY]], [[COPY1]] + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s64) = COPY $sgpr2_sgpr3 + %2:_(s64) = G_MUL %0, %1 +... + +--- +name: mul_s64_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: mul_s64_vv + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK-NEXT: [[UMULH:%[0-9]+]]:vgpr(s32) = G_UMULH [[UV]], [[UV2]] + ; CHECK-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[UV]], [[UV3]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UMULH]], [[MUL]] + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:vgpr(s32) = G_MUL [[UV1]], [[UV2]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[MUL1]] + ; CHECK-NEXT: [[MUL2:%[0-9]+]]:vgpr(s32) = G_MUL [[UV]], [[UV2]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[MUL2]](s32), [[ADD1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_MUL %0, %1 +... 
+ +--- +name: mul_s64_zext_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-LABEL: name: mul_s64_zext_ss + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[S_MUL_U64_:%[0-9]+]]:sgpr_64(s64) = S_MUL_U64 [[COPY]](s64), [[COPY1]](s64) + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s64) = COPY $sgpr2_sgpr3 + %2:_(s64) = G_AMDGPU_S_MUL_U64_U32 %0, %1 +... + +--- +name: mul_s64_zext_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: mul_s64_zext_vv + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr_32(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr_32(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:vreg_64(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[AMDGPU_MAD_U64_U32_:%[0-9]+]]:vgpr(s64), [[AMDGPU_MAD_U64_U32_1:%[0-9]+]]:vreg_64 = G_AMDGPU_MAD_U64_U32 [[TRUNC]](s32), [[TRUNC1]], [[C]] + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_AMDGPU_S_MUL_U64_U32 %0, %1 +... + +--- +name: mul_s64_sext_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-LABEL: name: mul_s64_sext_ss + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[S_MUL_U64_:%[0-9]+]]:sgpr_64(s64) = S_MUL_U64 [[COPY]](s64), [[COPY1]](s64) + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s64) = COPY $sgpr2_sgpr3 + %2:_(s64) = G_AMDGPU_S_MUL_I64_I32 %0, %1 +... + +--- +name: mul_s64_sext_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-LABEL: name: mul_s64_sext_vv + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr_32(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:vgpr_32(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:vreg_64(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[AMDGPU_MAD_I64_I32_:%[0-9]+]]:vgpr(s64), [[AMDGPU_MAD_I64_I32_1:%[0-9]+]]:vreg_64 = G_AMDGPU_MAD_I64_I32 [[TRUNC]](s32), [[TRUNC1]], [[C]] + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = G_AMDGPU_S_MUL_I64_I32 %0, %1 +... 
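The IR pattern the GlobalISel tests above cover is a 64-bit multiply whose operands are known to be zero- or sign-extended 32-bit values. A minimal standalone sketch (illustrative only; the function name and the llc invocation below are not part of this patch, and the authoritative RUN lines are the ones in the test files):

    ; 64-bit multiply of a zero-extended 32-bit value, matching the
    ; s_mul_u64_zext_* tests above. With the high halves known to be zero,
    ; the post-legalizer combiner can rewrite the G_MUL to
    ; G_AMDGPU_S_MUL_U64_U32, as checked in postlegalizercombiner-mul.mir.
    define i64 @mul_zext_example(i32 %x) {
      %ext = zext i32 %x to i64
      %mul = mul i64 %ext, 80
      ret i64 %mul
    }

Something like `llc -global-isel -march=amdgcn -mcpu=gfx1200` should be enough to push this through the GlobalISel pipeline these tests exercise.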
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 26d981ad7b4bf..b4c8da44337ae 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1259,20 +1259,21 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB3_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: s_mul_i32 s6, s6, 5 +; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: v_mov_b32_e32 v0, s6 +; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v0, s6 +; GFX1264-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-NEXT: s_waitcnt lgkmcnt(0) ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 @@ -1296,19 +1297,20 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-LABEL: add_i64_constant: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1232-NEXT: s_mov_b32 s5, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1232-NEXT: s_mov_b32 s5, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB3_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mul_i32 s5, s5, 5 +; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 +; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 @@ -1316,7 +1318,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_waitcnt vmcnt(0) ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB3_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 @@ -1643,23 +1645,21 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: s_mov_b64 s[2:3], exec ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: 
v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB4_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] ; GFX1264-NEXT: s_waitcnt lgkmcnt(0) -; GFX1264-NEXT: s_mul_i32 s9, s1, s8 -; GFX1264-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1264-NEXT: s_mul_i32 s8, s0, s8 -; GFX1264-NEXT: s_add_co_i32 s10, s10, s9 +; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 -; GFX1264-NEXT: v_mov_b32_e32 v1, s10 +; GFX1264-NEXT: v_mov_b32_e32 v1, s9 ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 @@ -1687,31 +1687,28 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_clause 0x1 ; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX1232-NEXT: s_mov_b32 s3, exec_lo ; GFX1232-NEXT: s_mov_b32 s2, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1232-NEXT: s_mov_b32 s3, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB4_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) -; GFX1232-NEXT: s_mul_i32 s8, s1, s3 -; GFX1232-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1232-NEXT: s_mul_i32 s3, s0, s3 -; GFX1232-NEXT: s_add_co_i32 s9, s9, s8 -; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1232-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 -; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_mov_b32 s8, s6 -; GFX1232-NEXT: s_mov_b32 s9, s7 -; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] +; GFX1232-NEXT: s_mov_b32 s14, -1 +; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1232-NEXT: s_mov_b32 s12, s6 +; GFX1232-NEXT: s_mov_b32 s13, s7 +; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_waitcnt vmcnt(0) ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB4_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) @@ -3182,20 +3179,21 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB9_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: s_mul_i32 s6, 
s6, 5 +; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: v_mov_b32_e32 v0, s6 +; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 ; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v0, s6 +; GFX1264-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-NEXT: s_waitcnt lgkmcnt(0) ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 @@ -3222,19 +3220,20 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-LABEL: sub_i64_constant: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1232-NEXT: s_mov_b32 s5, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1232-NEXT: s_mov_b32 s5, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB9_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mul_i32 s5, s5, 5 +; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 +; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 @@ -3242,7 +3241,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_waitcnt vmcnt(0) ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB9_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -3585,23 +3584,21 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: s_mov_b64 s[2:3], exec ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB10_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] ; GFX1264-NEXT: s_waitcnt lgkmcnt(0) -; GFX1264-NEXT: s_mul_i32 s9, s1, s8 -; GFX1264-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1264-NEXT: s_mul_i32 s8, s0, s8 -; GFX1264-NEXT: s_add_co_i32 s10, s10, s9 +; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 -; GFX1264-NEXT: v_mov_b32_e32 v1, s10 +; GFX1264-NEXT: v_mov_b32_e32 v1, s9 ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 @@ -3632,31 +3629,28 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_clause 0x1 ; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX1232-NEXT: s_mov_b32 s3, exec_lo ; GFX1232-NEXT: 
s_mov_b32 s2, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1232-NEXT: s_mov_b32 s3, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB10_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) -; GFX1232-NEXT: s_mul_i32 s8, s1, s3 -; GFX1232-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1232-NEXT: s_mul_i32 s3, s0, s3 -; GFX1232-NEXT: s_add_co_i32 s9, s9, s8 -; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1232-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 -; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_mov_b32 s8, s6 -; GFX1232-NEXT: s_mov_b32 s9, s7 -; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] +; GFX1232-NEXT: s_mov_b32 s14, -1 +; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1232-NEXT: s_mov_b32 s12, s6 +; GFX1232-NEXT: s_mov_b32 s13, s7 +; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_waitcnt vmcnt(0) ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB10_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) ; GFX1232-NEXT: v_mul_lo_u32 v5, s1, v2 ; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 5e90c33f3c8cb..e2617fc453b58 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s ; mul24 and mad24 are affected @@ -106,6 +107,27 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: test_mul_v2i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: test_mul_v2i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -247,6 +269,31 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: v_mul_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null +; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7 +; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6 +; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5 +; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: v_mul_v4i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ -351,6 +398,21 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: s_trunc_i64_mul_to_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mul_i32 s0, s0, s6 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: s_trunc_i64_mul_to_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -483,6 +545,31 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: v_trunc_i64_mul_to_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s2, s10 +; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s12, s6 +; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null +; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null +; GFX12-NEXT: s_mov_b32 s8, s4 +; GFX12-NEXT: s_mov_b32 s9, s5 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: v_trunc_i64_mul_to_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -587,6 +674,21 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: mul64_sext_c: +; GFX12: ; %bb.0: ; 
%entry +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_ashr_i32 s3, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: mul64_sext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -606,6 +708,113 @@ entry: ret void } +define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: mul64_zext_c: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: v_mov_b32_e32 v0, 0x50 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mul_hi_u32 v1, s4, v0 +; SI-NEXT: s_mulk_i32 s4, 0x50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mul64_zext_c: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x50 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_nop 2 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul64_zext_c: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_hi_u32 s0, s2, 0x50 +; GFX9-NEXT: s_mulk_i32 s2, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul64_zext_c: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s1, s2, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: mul64_zext_c: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: mul64_zext_c: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, 
s5 +; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; EG-LABEL: mul64_zext_c: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: MULHI * T0.Y, KC0[2].Z, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, +; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y, +; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) +entry: + %0 = zext i32 %in to i64 + %1 = mul i64 %0, 80 + store i64 %1, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_c: ; SI: ; %bb.0: ; %entry @@ -706,6 +915,27 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: v_mul64_sext_c: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0 +; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: v_mul64_sext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -732,6 +962,153 @@ entry: ret void } +define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: v_mul64_zext_c: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_movk_i32 s2, 0x50 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_hi_u32 v1, v0, s2 +; SI-NEXT: v_mul_lo_u32 v0, v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_mul64_zext_c: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_movk_i32 s2, 0x50 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_mul64_zext_c: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_movk_i32 s2, 0x50 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_hi_u32 v1, v0, s2 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_mul64_zext_c: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0 +; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_mul64_zext_c: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_hi_u32 v1, 0x50, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_mul64_zext_c: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0 +; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; EG-LABEL: v_mul64_zext_c: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: MULHI * T0.Y, T0.X, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, +; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, +; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) +entry: + %val = load i32, ptr addrspace(1) %in, align 4 + %ext = zext i32 %val to i64 + %mul = mul i64 %ext, 80 + store i64 %mul, ptr addrspace(1) %out, align 8 + ret void +} + define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_inline_imm: ; 
SI: ; %bb.0: ; %entry @@ -829,6 +1206,27 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: v_mul64_sext_inline_imm: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0 +; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: v_mul64_sext_inline_imm: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -925,6 +1323,22 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: s_mul_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mul_i32 s2, s2, s3 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -1034,6 +1448,26 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: v_mul_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1133,6 +1567,23 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: s_mul_i1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i1: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @10, KC0[], KC1[] @@ -1272,6 +1723,30 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: v_mul_i1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: buffer_load_u8 v0, off, s[8:11], null +; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i1: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ -1418,6 +1893,21 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: s_mul_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i64: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] @@ -1579,6 +2069,37 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; +; GFX12-LABEL: v_mul_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s2, s10 +; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s12, s6 +; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null +; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null +; GFX12-NEXT: s_mov_b32 s8, s4 +; GFX12-NEXT: s_mov_b32 s9, s5 +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i64: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 1, @10, 
KC0[CB0:0-32], KC1[]
@@ -1616,30 +2137,30 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0
-; SI-NEXT: s_cbranch_scc0 .LBB13_2
+; SI-NEXT: s_cbranch_scc0 .LBB15_2
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mul_i32 s6, s2, s3
; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: s_branch .LBB13_3
-; SI-NEXT: .LBB13_2:
+; SI-NEXT: s_branch .LBB15_3
+; SI-NEXT: .LBB15_2:
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: .LBB13_3: ; %Flow
+; SI-NEXT: .LBB15_3: ; %Flow
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB13_5
+; SI-NEXT: s_cbranch_vccnz .LBB15_5
; SI-NEXT: ; %bb.4: ; %if
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; SI-NEXT: s_branch .LBB13_6
-; SI-NEXT: .LBB13_5:
+; SI-NEXT: s_branch .LBB15_6
+; SI-NEXT: .LBB15_5:
; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: .LBB13_6: ; %endif
+; SI-NEXT: .LBB15_6: ; %endif
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1651,18 +2172,18 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cbranch_scc0 .LBB13_2
+; VI-NEXT: s_cbranch_scc0 .LBB15_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mul_i32 s6, s2, s3
; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: s_branch .LBB13_3
-; VI-NEXT: .LBB13_2:
+; VI-NEXT: s_branch .LBB15_3
+; VI-NEXT: .LBB15_2:
; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: ; implicit-def: $sgpr6
-; VI-NEXT: .LBB13_3: ; %Flow
+; VI-NEXT: .LBB15_3: ; %Flow
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT: s_cbranch_vccnz .LBB13_5
+; VI-NEXT: s_cbranch_vccnz .LBB15_5
; VI-NEXT: ; %bb.4: ; %if
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
@@ -1670,10 +2191,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; VI-NEXT: s_branch .LBB13_6
-; VI-NEXT: .LBB13_5:
+; VI-NEXT: s_branch .LBB15_6
+; VI-NEXT: .LBB15_5:
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: .LBB13_6: ; %endif
+; VI-NEXT: .LBB15_6: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
@@ -1686,18 +2207,18 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cbranch_scc0 .LBB13_2
+; GFX9-NEXT: s_cbranch_scc0 .LBB15_2
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_mul_i32 s6, s2, s3
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: s_branch .LBB13_3
-; GFX9-NEXT: .LBB13_2:
+; GFX9-NEXT: s_branch .LBB15_3
+; GFX9-NEXT: .LBB15_2:
; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: ; implicit-def: $sgpr6
-; GFX9-NEXT: .LBB13_3: ; %Flow
+; GFX9-NEXT: .LBB15_3: ; %Flow
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_vccnz .LBB13_5
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_5
; GFX9-NEXT: ; %bb.4: ; %if
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
@@ -1705,10 +2226,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; GFX9-NEXT: s_branch .LBB13_6
-; GFX9-NEXT: .LBB13_5:
+; GFX9-NEXT: s_branch .LBB15_6
+; GFX9-NEXT: .LBB15_5:
; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: .LBB13_6: ; %endif
+; GFX9-NEXT: .LBB15_6: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -1722,17 +2243,17 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-NEXT: s_cbranch_scc0 .LBB13_2
+; GFX10-NEXT: s_cbranch_scc0 .LBB15_2
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_mul_i32 s5, s2, s3
-; GFX10-NEXT: s_branch .LBB13_3
-; GFX10-NEXT: .LBB13_2:
+; GFX10-NEXT: s_branch .LBB15_3
+; GFX10-NEXT: .LBB15_2:
; GFX10-NEXT: s_mov_b32 s4, -1
; GFX10-NEXT: ; implicit-def: $sgpr5
-; GFX10-NEXT: .LBB13_3: ; %Flow
+; GFX10-NEXT: .LBB15_3: ; %Flow
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_vccnz .LBB13_5
+; GFX10-NEXT: s_cbranch_vccnz .LBB15_5
; GFX10-NEXT: ; %bb.4: ; %if
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
@@ -1740,10 +2261,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; GFX10-NEXT: s_branch .LBB13_6
-; GFX10-NEXT: .LBB13_5:
+; GFX10-NEXT: s_branch .LBB15_6
+; GFX10-NEXT: .LBB15_5:
; GFX10-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-NEXT: .LBB13_6: ; %endif
+; GFX10-NEXT: .LBB15_6: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
@@ -1757,17 +2278,17 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB13_2
+; GFX11-NEXT: s_cbranch_scc0 .LBB15_2
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_mul_i32 s5, s2, s3
-; GFX11-NEXT: s_branch .LBB13_3
-; GFX11-NEXT: .LBB13_2:
+; GFX11-NEXT: s_branch .LBB15_3
+; GFX11-NEXT: .LBB15_2:
; GFX11-NEXT: s_mov_b32 s4, -1
; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: .LBB13_3: ; %Flow
+; GFX11-NEXT: .LBB15_3: ; %Flow
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB13_5
+; GFX11-NEXT: s_cbranch_vccnz .LBB15_5
; GFX11-NEXT: ; %bb.4: ; %if
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -1775,10 +2296,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_branch .LBB13_6
-; GFX11-NEXT: .LBB13_5:
+; GFX11-NEXT: s_branch .LBB15_6
+; GFX11-NEXT: .LBB15_5:
; GFX11-NEXT: v_mov_b32_e32 v0, s5
-; GFX11-NEXT: .LBB13_6: ; %endif
+; GFX11-NEXT: .LBB15_6: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
@@ -1788,6 +2309,43 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: mul32_in_branch:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_cmp_lg_u32 s2, 0
+; GFX12-NEXT: s_cbranch_scc0 .LBB15_2
+; GFX12-NEXT: ; %bb.1: ; %else
+; GFX12-NEXT: s_mul_i32 s5, s2, s3
+; GFX12-NEXT: s_branch .LBB15_3
+; GFX12-NEXT: .LBB15_2:
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: ; implicit-def: $sgpr5
+; GFX12-NEXT: .LBB15_3: ; %Flow
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccnz .LBB15_5
+; GFX12-NEXT: ; %bb.4: ; %if
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s4, s2
+; GFX12-NEXT: s_mov_b32 s5, s3
+; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_branch .LBB15_6
+; GFX12-NEXT: .LBB15_5:
+; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: .LBB15_6: ; %endif
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: mul32_in_branch:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
@@ -1850,7 +2408,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
; SI-NEXT: s_and_b64 vcc, exec, s[10:11]
-; SI-NEXT: s_cbranch_vccz .LBB14_4
+; SI-NEXT: s_cbranch_vccz .LBB16_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mul_hi_u32 v0, s4, v0
@@ -1861,22 +2419,22 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT: s_cbranch_vccnz .LBB14_3
-; SI-NEXT: .LBB14_2: ; %if
+; SI-NEXT: s_cbranch_vccnz .LBB16_3
+; SI-NEXT: .LBB16_2: ; %if
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; SI-NEXT: .LBB14_3: ; %endif
+; SI-NEXT: .LBB16_3: ; %endif
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
-; SI-NEXT: .LBB14_4:
+; SI-NEXT: .LBB16_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
-; SI-NEXT: s_branch .LBB14_2
+; SI-NEXT: s_branch .LBB16_2
;
; VI-LABEL: mul64_in_branch:
; VI: ; %bb.0: ; %entry
@@ -1884,7 +2442,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_mov_b64 s[8:9], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
-; VI-NEXT: s_cbranch_scc0 .LBB14_4
+; VI-NEXT: s_cbranch_scc0 .LBB16_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
@@ -1893,22 +2451,22 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_mul_i32 s4, s5, s6
; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; VI-NEXT: s_cbranch_vccnz .LBB14_3
-; VI-NEXT: .LBB14_2: ; %if
+; VI-NEXT: s_cbranch_vccnz .LBB16_3
+; VI-NEXT: .LBB16_2: ; %if
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; VI-NEXT: .LBB14_3: ; %endif
+; VI-NEXT: .LBB16_3: ; %endif
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
-; VI-NEXT: .LBB14_4:
+; VI-NEXT: .LBB16_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
-; VI-NEXT: s_branch .LBB14_2
+; VI-NEXT: s_branch .LBB16_2
;
; GFX9-LABEL: mul64_in_branch:
; GFX9: ; %bb.0: ; %entry
@@ -1916,7 +2474,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_cbranch_scc0 .LBB14_3
+; GFX9-NEXT: s_cbranch_scc0 .LBB16_3
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_mul_i32 s7, s4, s7
; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6
@@ -1925,21 +2483,21 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_add_i32 s5, s7, s5
; GFX9-NEXT: s_mul_i32 s4, s4, s6
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; GFX9-NEXT: s_cbranch_vccnz .LBB14_4
-; GFX9-NEXT: .LBB14_2: ; %if
+; GFX9-NEXT: s_cbranch_vccnz .LBB16_4
+; GFX9-NEXT: .LBB16_2: ; %if
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX9-NEXT: s_branch .LBB14_5
-; GFX9-NEXT: .LBB14_3:
+; GFX9-NEXT: s_branch .LBB16_5
+; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-NEXT: s_branch .LBB14_2
-; GFX9-NEXT: .LBB14_4:
+; GFX9-NEXT: s_branch .LBB16_2
+; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: .LBB14_5: ; %endif
+; GFX9-NEXT: .LBB16_5: ; %endif
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1951,7 +2509,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX10-NEXT: s_cbranch_scc0 .LBB14_3
+; GFX10-NEXT: s_cbranch_scc0 .LBB16_3
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_mul_i32 s7, s4, s7
; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
@@ -1960,22 +2518,22 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: s_mul_i32 s4, s4, s6
; GFX10-NEXT: s_add_i32 s5, s7, s5
; GFX10-NEXT: s_mov_b32 s6, 0
-; GFX10-NEXT: s_cbranch_execnz .LBB14_4
-; GFX10-NEXT: .LBB14_2: ; %if
+; GFX10-NEXT: s_cbranch_execnz .LBB16_4
+; GFX10-NEXT: .LBB16_2: ; %if
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX10-NEXT: s_branch .LBB14_5
-; GFX10-NEXT: .LBB14_3:
+; GFX10-NEXT: s_branch .LBB16_5
+; GFX10-NEXT: .LBB16_3:
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX10-NEXT: s_branch .LBB14_2
-; GFX10-NEXT: .LBB14_4:
+; GFX10-NEXT: s_branch .LBB16_2
+; GFX10-NEXT: .LBB16_4:
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: .LBB14_5: ; %endif
+; GFX10-NEXT: .LBB16_5: ; %endif
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1987,7 +2545,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB14_3
+; GFX11-NEXT: s_cbranch_scc0 .LBB16_3
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_mul_i32 s7, s4, s7
; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
@@ -1996,21 +2554,21 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_mul_i32 s4, s4, s6
; GFX11-NEXT: s_add_i32 s5, s7, s5
; GFX11-NEXT: s_mov_b32 s6, 0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_4
-; GFX11-NEXT: .LBB14_2: ; %if
+; GFX11-NEXT: s_cbranch_execnz .LBB16_4
+; GFX11-NEXT: .LBB16_2: ; %if
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_branch .LBB14_5
-; GFX11-NEXT: .LBB14_3:
+; GFX11-NEXT: s_branch .LBB16_5
+; GFX11-NEXT: .LBB16_3:
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX11-NEXT: s_branch .LBB14_2
-; GFX11-NEXT: .LBB14_4:
+; GFX11-NEXT: s_branch .LBB16_2
+; GFX11-NEXT: .LBB16_4:
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: .LBB14_5: ; %endif
+; GFX11-NEXT: .LBB16_5: ; %endif
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2019,6 +2577,38 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: mul64_in_branch:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cbranch_scc0 .LBB16_3
+; GFX12-NEXT: ; %bb.1: ; %else
+; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_cbranch_execnz .LBB16_4
+; GFX12-NEXT: .LBB16_2: ; %if
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s4, s2
+; GFX12-NEXT: s_mov_b32 s5, s3
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_branch .LBB16_5
+; GFX12-NEXT: .LBB16_3:
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT: s_branch .LBB16_2
+; GFX12-NEXT: .LBB16_4:
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: .LBB16_5: ; %endif
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: mul64_in_branch:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
@@ -2324,6 +2914,51 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: s_mul_i128:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c
+; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s15, s3
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s17, s3
+; GFX12-NEXT: s_mov_b32 s19, s3
+; GFX12-NEXT: s_mov_b32 s24, s3
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_mov_b32 s2, s4
+; GFX12-NEXT: s_mov_b32 s14, s8
+; GFX12-NEXT: s_mov_b32 s12, s9
+; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3]
+; GFX12-NEXT: s_mov_b32 s2, s23
+; GFX12-NEXT: s_mov_b32 s16, s5
+; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11]
+; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9]
+; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17]
+; GFX12-NEXT: s_mov_b32 s2, s11
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11]
+; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17]
+; GFX12-NEXT: s_mov_b32 s18, s7
+; GFX12-NEXT: s_mov_b32 s23, s3
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19]
+; GFX12-NEXT: s_mov_b32 s25, s6
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3]
+; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25]
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: s_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[]
@@ -2570,6 +3205,44 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
+; GFX12-LABEL: v_mul_i128:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_load_b128 v[0:3], v15, s[0:1]
+; GFX12-NEXT: global_load_b128 v[4:7], v15, s[2:3]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
+; GFX12-NEXT: v_mul_lo_u32 v14, v5, v2
+; GFX12-NEXT: v_mul_lo_u32 v3, v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10]
+; GFX12-NEXT: v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v4, v2, 0
+; GFX12-NEXT: v_mul_lo_u32 v4, v6, v1
+; GFX12-NEXT: v_mov_b32_e32 v2, v10
+; GFX12-NEXT: v_mul_lo_u32 v10, v7, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add3_u32 v12, v12, v3, v14
+; GFX12-NEXT: v_add_co_u32 v2, s0, v13, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0
+; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v6, v0, v[11:12]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v1, v5, v[2:3]
+; GFX12-NEXT: v_add3_u32 v0, v10, v14, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13
+; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
+; GFX12-NEXT: global_store_b128 v15, v[8:11], s[2:3]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
; EG-LABEL: v_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
@@ -2672,6 +3345,12 @@ define i32 @mul_pow2_plus_1(i32 %val) {
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: mul_pow2_plus_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; EG-LABEL: mul_pow2_plus_1:
; EG: ; %bb.0:
; EG-NEXT: CF_END