From 556f7ff7836e4d884c64bc87bcef80d1687ccf86 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Fri, 7 Mar 2025 12:27:45 +0800 Subject: [PATCH 01/20] Implement vop3p complex pattern optmization for gisel --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 381 ++++++++++++++++-- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 4 +- .../AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll | 3 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll | 24 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll | 6 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll | 12 +- .../AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll | 36 +- .../AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll | 12 +- .../AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll | 12 +- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 10 +- llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll | 11 +- llvm/test/lit.cfg.py | 2 +- 12 files changed, 400 insertions(+), 113 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 441fb5730a6d8..0dc47b957bdac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4282,30 +4282,346 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { }}; } -std::pair -AMDGPUInstructionSelector::selectVOP3PModsImpl( - Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { - unsigned Mods = 0; - MachineInstr *MI = MRI.getVRegDef(Src); +enum srcStatus { + IS_SAME, + IS_UPPER_HALF, + IS_LOWER_HALF, + IS_NEG, + IS_UPPER_HALF_NEG, + IS_LOWER_HALF_NEG, + LAST_STAT = IS_LOWER_HALF_NEG +}; + +bool isTruncHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) { + assert(MI->getOpcode() == AMDGPU::G_TRUNC); + unsigned dstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits(); + unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); + return dstSize * 2 == srcSize; +} + +bool isLshrHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) { + assert(MI->getOpcode() == AMDGPU::G_LSHR); + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(MI->getOperand(0).getReg(), MRI, + m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) { + unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); + unsigned shift = ShiftAmt->Value.getZExtValue(); + return shift * 2 == srcSize; + } + return false; +} - if (MI->getOpcode() == AMDGPU::G_FNEG && - // It's possible to see an f32 fneg here, but unlikely. - // TODO: Treat f32 fneg as only high bit. - MRI.getType(Src) == LLT::fixed_vector(2, 16)) { - Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); - Src = MI->getOperand(1).getReg(); - MI = MRI.getVRegDef(Src); +bool isShlHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) { + assert(MI->getOpcode() == AMDGPU::G_SHL); + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(MI->getOperand(0).getReg(), MRI, + m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) { + unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); + unsigned shift = ShiftAmt->Value.getZExtValue(); + return shift * 2 == srcSize; + } + return false; +} + +bool retOpStat(MachineOperand *Op, int stat, + std::pair &curr) { + if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() || + Op->isCImm() || Op->isFPImm()) { + curr = {Op, stat}; + return true; + } + return false; +} + +bool calcNextStatus(std::pair &curr, + const MachineRegisterInfo &MRI) { + if (!curr.first->isReg()) { + return false; + } + MachineInstr *MI = nullptr; + + if (!curr.first->isDef()) { + // MRI.getVRegDef falls into infinite loop if use define reg + MI = MRI.getVRegDef(curr.first->getReg()); + } else { + MI = curr.first->getParent(); + } + if (!MI) { + return false; + } + + unsigned Opc = MI->getOpcode(); + + // Handle general Opc cases + switch (Opc) { + case AMDGPU::G_BITCAST: + case AMDGPU::G_CONSTANT: + case AMDGPU::G_FCONSTANT: + case AMDGPU::COPY: + return retOpStat(&MI->getOperand(1), curr.second, curr); + case AMDGPU::G_FNEG: + // XXXX + 3 = XXXX_NEG, (XXXX_NEG + 3) mod 3 = XXXX + return retOpStat(&MI->getOperand(1), + (curr.second + ((LAST_STAT + 1) / 2)) % (LAST_STAT + 1), + curr); + } + + // Calc next stat from current stat + switch (curr.second) { + case IS_SAME: + switch (Opc) { + case AMDGPU::G_TRUNC: { + if (isTruncHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr); + } + break; + } + } + break; + case IS_NEG: + switch (Opc) { + case AMDGPU::G_TRUNC: { + if (isTruncHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr); + } + break; + } + } + break; + case IS_UPPER_HALF: + switch (Opc) { + case AMDGPU::G_SHL: { + if (isShlHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr); + } + break; + } + } + break; + case IS_LOWER_HALF: + switch (Opc) { + case AMDGPU::G_LSHR: { + if (isLshrHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, curr); + } + break; + } + } + break; + case IS_UPPER_HALF_NEG: + switch (Opc) { + case AMDGPU::G_SHL: { + if (isShlHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr); + } + break; + } + } + break; + case IS_LOWER_HALF_NEG: + switch (Opc) { + case AMDGPU::G_LSHR: { + if (isLshrHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, curr); + } + break; + } + } + break; + } + return false; +} + +std::vector> +getSrcStats(MachineOperand *Op, const MachineRegisterInfo &MRI, + bool onlyLastSameOrNeg = false, int maxDepth = 6) { + int depth = 0; + std::pair curr = {Op, IS_SAME}; + std::vector> statList; + + while (true) { + depth++; + if (depth > maxDepth) { + break; + } + bool ret = calcNextStatus(curr, MRI); + if (!ret || (onlyLastSameOrNeg && + (curr.second != IS_SAME && curr.second != IS_NEG))) { + break; + } else if (!onlyLastSameOrNeg) { + statList.push_back(curr); + } } + if (onlyLastSameOrNeg) { + statList.push_back(curr); + } + return statList; +} - // TODO: Handle G_FSUB 0 as fneg +bool isInlinableConstant(MachineOperand *Op, const SIInstrInfo &TII) { + bool a = TII.isInlineConstant(*Op); + switch (Op->getType()) { + case MachineOperand::MachineOperandType::MO_Immediate: + return TII.isInlineConstant(*Op); + case MachineOperand::MachineOperandType::MO_CImmediate: + return TII.isInlineConstant(Op->getCImm()->getValue()); + case MachineOperand::MachineOperandType::MO_FPImmediate: + return TII.isInlineConstant(Op->getFPImm()->getValueAPF()); + } + return false; +} - // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. - (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard() +bool isSameBitWidth(MachineOperand *Op1, MachineOperand *Op2, + const MachineRegisterInfo &MRI) { + unsigned width1 = MRI.getType(Op1->getReg()).getSizeInBits(); + unsigned width2 = MRI.getType(Op2->getReg()).getSizeInBits(); + return width1 == width2; +} +bool isSameOperand(MachineOperand *Op1, MachineOperand *Op2) { + if (Op1->isReg()) { + if (Op2->isReg()) { + return Op1->getReg() == Op2->getReg(); + } + return false; + } + return Op1->isIdenticalTo(*Op2); +} + +bool validToPack(int HiStat, int LoStat, unsigned int &Mods, + MachineOperand *newOp, MachineOperand *RootOp, + const SIInstrInfo &TII, const MachineRegisterInfo &MRI) { + if (newOp->isReg()) { + if (isSameBitWidth(newOp, RootOp, MRI)) { + // IS_LOWER_HALF remain 0 + if (HiStat == IS_UPPER_HALF_NEG) { + Mods ^= SISrcMods::NEG_HI; + Mods |= SISrcMods::OP_SEL_1; + } else if (HiStat == IS_UPPER_HALF) { + Mods |= SISrcMods::OP_SEL_1; + } else if (HiStat == IS_LOWER_HALF_NEG) { + Mods ^= SISrcMods::NEG_HI; + } + if (LoStat == IS_UPPER_HALF_NEG) { + Mods ^= SISrcMods::NEG; + Mods |= SISrcMods::OP_SEL_0; + } else if (LoStat == IS_UPPER_HALF) { + Mods |= SISrcMods::OP_SEL_0; + } else if (LoStat == IS_UPPER_HALF_NEG) { + Mods |= SISrcMods::NEG; + } + return true; + } + } else { + if ((HiStat == IS_SAME || HiStat == IS_NEG) && + (LoStat == IS_SAME || LoStat == IS_NEG) && + isInlinableConstant(newOp, TII)) { + if (HiStat == IS_NEG) { + Mods ^= SISrcMods::NEG_HI; + } + if (LoStat == IS_NEG) { + Mods ^= SISrcMods::NEG; + } + // opsel = opsel_hi = 0, since the upper half and lower half both + // the same as the target inlinable constant + return true; + } + } + return false; +} + +std::pair +AMDGPUInstructionSelector::selectVOP3PModsImpl(MachineOperand *Op, + const MachineRegisterInfo &MRI, + bool IsDOT) const { + unsigned Mods = 0; + MachineOperand *RootOp = Op; + std::pair stat = getSrcStats(Op, MRI, true)[0]; + if (!stat.first->isReg()) { + Mods |= SISrcMods::OP_SEL_1; + return {Op, Mods}; + } + if (stat.second == IS_NEG) { + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + } + Op = stat.first; + MachineInstr *MI = MRI.getVRegDef(Op->getReg()); + if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 && + (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { + std::vector> statList_Hi; + std::vector> statList_Lo; + statList_Hi = getSrcStats(&MI->getOperand(2), MRI); + if (statList_Hi.size() != 0) { + statList_Lo = getSrcStats(&MI->getOperand(1), MRI); + if (statList_Lo.size() != 0) { + for (int i = statList_Hi.size() - 1; i >= 0; i--) { + for (int j = statList_Lo.size() - 1; j >= 0; j--) { + if (isSameOperand(statList_Hi[i].first, statList_Lo[j].first)) { + if (validToPack(statList_Hi[i].second, statList_Lo[j].second, + Mods, statList_Hi[i].first, RootOp, TII, MRI)) { + return {statList_Hi[i].first, Mods}; + } + } + } + } + } + } + } // Packed instructions do not have abs modifiers. Mods |= SISrcMods::OP_SEL_1; - return std::pair(Src, Mods); + return {Op, Mods}; +} + +int64_t getAllKindImm(MachineOperand *Op) { + switch (Op->getType()) { + case MachineOperand::MachineOperandType::MO_Immediate: + return Op->getImm(); + case MachineOperand::MachineOperandType::MO_CImmediate: + return Op->getCImm()->getSExtValue(); + break; + case MachineOperand::MachineOperandType::MO_FPImmediate: + return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue(); + break; + } + llvm_unreachable("not an imm type"); +} + +bool checkRB(MachineOperand *Op, int RBNo, const AMDGPURegisterBankInfo &RBI, + const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) { + const RegisterBank *RB = RBI.getRegBank(Op->getReg(), MRI, TRI); + return RB->getID() == RBNo; +} + +MachineOperand *getVReg(MachineOperand *newOp, MachineOperand *RootOp, + const AMDGPURegisterBankInfo &RBI, + MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) { + // RootOp can only be VGPR or SGPR (some hand written cases such as + // inst-select-ashr.v2s16.mir::ashr_v2s16_vs) + if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) || + checkRB(newOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) { + return newOp; + } + MachineInstr *MI = MRI.getVRegDef(RootOp->getReg()); + if (MI->getOpcode() == AMDGPU::COPY && + isSameOperand(newOp, &MI->getOperand(1))) { + // RootOp is VGPR, newOp is not VGPR, but RootOp = COPY newOp + return RootOp; + } + + const TargetRegisterClass *DstRC = + TRI.getConstrainedRegClassForOperand(*RootOp, MRI); + Register dstReg = MRI.createVirtualRegister(DstRC); + + MachineIRBuilder B(*RootOp->getParent()); + MachineInstrBuilder MIB = + B.buildInstr(AMDGPU::COPY).addDef(dstReg).addUse(newOp->getReg()); + + // only accept VGPR + return &MIB->getOperand(0); } InstructionSelector::ComplexRendererFns @@ -4313,13 +4629,17 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); - Register Src; - unsigned Mods; - std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); - + std::pair res = selectVOP3PModsImpl(&Root, MRI); + if (!(res.first->isReg())) { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods + }}; + } + res.first = getVReg(res.first, &Root, RBI, MRI, TRI); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods }}; } @@ -4328,13 +4648,18 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); - Register Src; - unsigned Mods; - std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true); - + std::pair res = + selectVOP3PModsImpl(&Root, MRI, true); + if (!(res.first->isReg())) { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods + }}; + } + res.first = getVReg(res.first, &Root, RBI, MRI, TRI); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods }}; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index cc7552868a056..2af4f55403acc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -187,8 +187,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const; - std::pair - selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI, + std::pair + selectVOP3PModsImpl(MachineOperand *Op, const MachineRegisterInfo &MRI, bool IsDOT = false) const; InstructionSelector::ComplexRendererFns diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll index 1d9514c58ab9c..2243c57cf37ac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll @@ -68,8 +68,7 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_neg_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll index e2dab03e410aa..7d6cfac52714e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -248,8 +248,7 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) { ; GFX906-LABEL: v_sdot2_fnegf32_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_fnegf32_c: @@ -263,8 +262,7 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) { ; GFX10-LABEL: v_sdot2_fnegf32_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg float %c %cast.neg.c = bitcast float %neg.c to i32 @@ -276,8 +274,7 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) { ; GFX906-LABEL: v_sdot2_fnegv2f16_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_fnegv2f16_c: @@ -291,8 +288,7 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) { ; GFX10-LABEL: v_sdot2_fnegv2f16_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg <2 x half> %c %cast.neg.c = bitcast <2 x half> %neg.c to i32 @@ -304,8 +300,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_sdot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_a: @@ -319,8 +314,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX10-LABEL: v_sdot2_shuffle10_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -331,8 +325,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_sdot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_b: @@ -346,8 +339,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX10-LABEL: v_sdot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll index 06560afee3c9a..d6ef48e25cafb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -91,8 +91,7 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_sdot4_fnegf32_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 +; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot4_fnegf32_a: @@ -112,8 +111,7 @@ define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_sdot4_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 +; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot4_fnegv2f16_a: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll index 0d729351f65a7..d2aa47df81cbe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll @@ -47,15 +47,13 @@ define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_sdot8_fnegf32_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 +; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot8_fnegf32_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 +; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg float %a %cast.neg.a = bitcast float %neg.a to i32 @@ -67,15 +65,13 @@ define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_sdot8_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 +; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot8_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 +; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll index 3acff52874dd9..347644826fd0c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -235,22 +235,19 @@ define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) { ; GFX906-LABEL: v_udot2_fnegf32_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_fnegf32_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_fnegf32_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg float %c %cast.neg.c = bitcast float %neg.c to i32 @@ -262,22 +259,19 @@ define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) { ; GFX906-LABEL: v_udot2_fnegv2f16_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_fnegv2f16_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_fnegv2f16_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg <2 x half> %c %cast.neg.c = bitcast <2 x half> %neg.c to i32 @@ -289,22 +283,19 @@ define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_udot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -315,22 +306,19 @@ define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_udot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll index b14af9e043e09..7ad0404942feb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -112,15 +112,13 @@ define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_udot4_fnegf32_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 +; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_udot4_fnegf32_a: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 +; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg float %a %cast.neg.a = bitcast float %neg.a to i32 @@ -132,15 +130,13 @@ define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_udot4_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 +; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_udot4_fnegv2f16_a: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 +; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll index a664c8aa508ef..52763bbc24e40 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll @@ -48,15 +48,13 @@ define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_udot8_fnegf32_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 +; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_udot8_fnegf32_a: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 +; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg float %a %cast.neg.a = bitcast float %neg.a to i32 @@ -68,15 +66,13 @@ define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_udot8_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 +; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_udot8_fnegv2f16_a: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 +; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 9b03a72fd826d..0577ba9b233be 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -87,7 +87,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { ; GCN-LABEL: {{^}}fadd_v2_v_lit_splat: ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} ; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0{{$}} +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -308,7 +308,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { ; GCN-LABEL: {{^}}fmul_v2_v_lit_splat: ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0{{$}} +; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -432,7 +432,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { ; GCN-LABEL: {{^}}fma_v2_v_lit_splat: ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0 ; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0{{$}} +; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}} define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -556,8 +556,8 @@ bb: ; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0{{$}} +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}} +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { bb: %i12 = fadd <2 x float> zeroinitializer, %arg diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index 3420596da2aac..c6349bcbcdbf1 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -477,9 +477,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; ; GFX9-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-GISEL-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict: @@ -519,8 +518,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; ; GFX10-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 -; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s0 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-GISEL-NEXT: ; return to shader part epilog ; ; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict: @@ -535,8 +533,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; ; GFX10PLUS-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX10PLUS-GISEL: ; %bb.0: -; GFX10PLUS-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 -; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s0 +; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1] ; GFX10PLUS-GISEL-NEXT: ; return to shader part epilog %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index aad7a088551b2..50921879cd1f2 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -466,7 +466,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 58464a3a24d90211f784f54f816095378a7569dc Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Fri, 7 Mar 2025 12:37:51 +0800 Subject: [PATCH 02/20] fix lit file --- llvm/test/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 50921879cd1f2..aad7a088551b2 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -466,7 +466,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("utf-8") + readobj_out = readobj_cmd.stdout.read().decode("ascii") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From daae1aeaefe49f5cbb14facf8c4535e431ab741a Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 10 Mar 2025 14:16:30 +0800 Subject: [PATCH 03/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 98 +++++++++---------- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 4 +- 2 files changed, 50 insertions(+), 52 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 0dc47b957bdac..00d538f55a3cf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4292,14 +4292,15 @@ enum srcStatus { LAST_STAT = IS_LOWER_HALF_NEG }; -bool isTruncHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) { +static bool isTruncHalf(const MachineInstr *MI, + const MachineRegisterInfo &MRI) { assert(MI->getOpcode() == AMDGPU::G_TRUNC); unsigned dstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits(); unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); return dstSize * 2 == srcSize; } -bool isLshrHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) { +static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { assert(MI->getOpcode() == AMDGPU::G_LSHR); Register ShiftSrc; std::optional ShiftAmt; @@ -4312,7 +4313,7 @@ bool isLshrHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) { return false; } -bool isShlHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) { +static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { assert(MI->getOpcode() == AMDGPU::G_SHL); Register ShiftSrc; std::optional ShiftAmt; @@ -4325,8 +4326,8 @@ bool isShlHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) { return false; } -bool retOpStat(MachineOperand *Op, int stat, - std::pair &curr) { +static bool retOpStat(const MachineOperand *Op, int stat, + std::pair &curr) { if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() || Op->isCImm() || Op->isFPImm()) { curr = {Op, stat}; @@ -4335,15 +4336,14 @@ bool retOpStat(MachineOperand *Op, int stat, return false; } -bool calcNextStatus(std::pair &curr, - const MachineRegisterInfo &MRI) { +static bool calcNextStatus(std::pair &curr, + const MachineRegisterInfo &MRI) { if (!curr.first->isReg()) { return false; } - MachineInstr *MI = nullptr; + const MachineInstr *MI = nullptr; if (!curr.first->isDef()) { - // MRI.getVRegDef falls into infinite loop if use define reg MI = MRI.getVRegDef(curr.first->getReg()); } else { MI = curr.first->getParent(); @@ -4434,12 +4434,12 @@ bool calcNextStatus(std::pair &curr, return false; } -std::vector> -getSrcStats(MachineOperand *Op, const MachineRegisterInfo &MRI, +SmallVector> +getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, bool onlyLastSameOrNeg = false, int maxDepth = 6) { int depth = 0; - std::pair curr = {Op, IS_SAME}; - std::vector> statList; + std::pair curr = {Op, IS_SAME}; + SmallVector> statList; while (true) { depth++; @@ -4460,27 +4460,23 @@ getSrcStats(MachineOperand *Op, const MachineRegisterInfo &MRI, return statList; } -bool isInlinableConstant(MachineOperand *Op, const SIInstrInfo &TII) { - bool a = TII.isInlineConstant(*Op); - switch (Op->getType()) { - case MachineOperand::MachineOperandType::MO_Immediate: - return TII.isInlineConstant(*Op); - case MachineOperand::MachineOperandType::MO_CImmediate: - return TII.isInlineConstant(Op->getCImm()->getValue()); - case MachineOperand::MachineOperandType::MO_FPImmediate: - return TII.isInlineConstant(Op->getFPImm()->getValueAPF()); +static bool isInlinableConstant(const MachineOperand &Op, + const SIInstrInfo &TII) { + if (Op.isFPImm()) { + return TII.isInlineConstant(Op.getFPImm()->getValueAPF()); } return false; } -bool isSameBitWidth(MachineOperand *Op1, MachineOperand *Op2, - const MachineRegisterInfo &MRI) { +static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2, + const MachineRegisterInfo &MRI) { unsigned width1 = MRI.getType(Op1->getReg()).getSizeInBits(); unsigned width2 = MRI.getType(Op2->getReg()).getSizeInBits(); return width1 == width2; } -bool isSameOperand(MachineOperand *Op1, MachineOperand *Op2) { +static bool isSameOperand(const MachineOperand *Op1, + const MachineOperand *Op2) { if (Op1->isReg()) { if (Op2->isReg()) { return Op1->getReg() == Op2->getReg(); @@ -4490,9 +4486,10 @@ bool isSameOperand(MachineOperand *Op1, MachineOperand *Op2) { return Op1->isIdenticalTo(*Op2); } -bool validToPack(int HiStat, int LoStat, unsigned int &Mods, - MachineOperand *newOp, MachineOperand *RootOp, - const SIInstrInfo &TII, const MachineRegisterInfo &MRI) { +static bool validToPack(int HiStat, int LoStat, unsigned int &Mods, + const MachineOperand *newOp, + const MachineOperand *RootOp, const SIInstrInfo &TII, + const MachineRegisterInfo &MRI) { if (newOp->isReg()) { if (isSameBitWidth(newOp, RootOp, MRI)) { // IS_LOWER_HALF remain 0 @@ -4517,7 +4514,7 @@ bool validToPack(int HiStat, int LoStat, unsigned int &Mods, } else { if ((HiStat == IS_SAME || HiStat == IS_NEG) && (LoStat == IS_SAME || LoStat == IS_NEG) && - isInlinableConstant(newOp, TII)) { + isInlinableConstant(*newOp, TII)) { if (HiStat == IS_NEG) { Mods ^= SISrcMods::NEG_HI; } @@ -4532,13 +4529,13 @@ bool validToPack(int HiStat, int LoStat, unsigned int &Mods, return false; } -std::pair -AMDGPUInstructionSelector::selectVOP3PModsImpl(MachineOperand *Op, +std::pair +AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, const MachineRegisterInfo &MRI, bool IsDOT) const { unsigned Mods = 0; - MachineOperand *RootOp = Op; - std::pair stat = getSrcStats(Op, MRI, true)[0]; + const MachineOperand *RootOp = Op; + std::pair stat = getSrcStats(Op, MRI, true)[0]; if (!stat.first->isReg()) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; @@ -4550,8 +4547,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(MachineOperand *Op, MachineInstr *MI = MRI.getVRegDef(Op->getReg()); if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 && (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { - std::vector> statList_Hi; - std::vector> statList_Lo; + SmallVector> statList_Hi; + SmallVector> statList_Lo; statList_Hi = getSrcStats(&MI->getOperand(2), MRI); if (statList_Hi.size() != 0) { statList_Lo = getSrcStats(&MI->getOperand(1), MRI); @@ -4575,30 +4572,29 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(MachineOperand *Op, return {Op, Mods}; } -int64_t getAllKindImm(MachineOperand *Op) { +int64_t getAllKindImm(const MachineOperand *Op) { switch (Op->getType()) { case MachineOperand::MachineOperandType::MO_Immediate: return Op->getImm(); case MachineOperand::MachineOperandType::MO_CImmediate: return Op->getCImm()->getSExtValue(); - break; case MachineOperand::MachineOperandType::MO_FPImmediate: return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue(); - break; } llvm_unreachable("not an imm type"); } -bool checkRB(MachineOperand *Op, int RBNo, const AMDGPURegisterBankInfo &RBI, - const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) { +bool checkRB(const MachineOperand *Op, int RBNo, + const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) { const RegisterBank *RB = RBI.getRegBank(Op->getReg(), MRI, TRI); return RB->getID() == RBNo; } -MachineOperand *getVReg(MachineOperand *newOp, MachineOperand *RootOp, - const AMDGPURegisterBankInfo &RBI, - MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) { +const MachineOperand * +getVReg(const MachineOperand *newOp, const MachineOperand *RootOp, + const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, const SIInstrInfo &TII) { // RootOp can only be VGPR or SGPR (some hand written cases such as // inst-select-ashr.v2s16.mir::ashr_v2s16_vs) if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) || @@ -4612,13 +4608,14 @@ MachineOperand *getVReg(MachineOperand *newOp, MachineOperand *RootOp, return RootOp; } + MachineBasicBlock *BB = MI->getParent(); const TargetRegisterClass *DstRC = TRI.getConstrainedRegClassForOperand(*RootOp, MRI); Register dstReg = MRI.createVirtualRegister(DstRC); - MachineIRBuilder B(*RootOp->getParent()); MachineInstrBuilder MIB = - B.buildInstr(AMDGPU::COPY).addDef(dstReg).addUse(newOp->getReg()); + BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), dstReg) + .addReg(newOp->getReg()); // only accept VGPR return &MIB->getOperand(0); @@ -4629,14 +4626,15 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); - std::pair res = selectVOP3PModsImpl(&Root, MRI); + std::pair res = + selectVOP3PModsImpl(&Root, MRI); if (!(res.first->isReg())) { return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods }}; } - res.first = getVReg(res.first, &Root, RBI, MRI, TRI); + res.first = getVReg(res.first, &Root, RBI, MRI, TRI, TII); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods @@ -4648,7 +4646,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); - std::pair res = + std::pair res = selectVOP3PModsImpl(&Root, MRI, true); if (!(res.first->isReg())) { return {{ @@ -4656,7 +4654,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods }}; } - res.first = getVReg(res.first, &Root, RBI, MRI, TRI); + res.first = getVReg(res.first, &Root, RBI, MRI, TRI, TII); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 2af4f55403acc..dd172edfdf216 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -187,8 +187,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const; - std::pair - selectVOP3PModsImpl(MachineOperand *Op, const MachineRegisterInfo &MRI, + std::pair + selectVOP3PModsImpl(const MachineOperand *Op, const MachineRegisterInfo &MRI, bool IsDOT = false) const; InstructionSelector::ComplexRendererFns From 2e587f5fbcc23f6574c4f6f7b86974f0c6352ca4 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 12 Mar 2025 11:18:17 +0800 Subject: [PATCH 04/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 122 ++++++++---------- llvm/test/lit.cfg.py | 2 +- 2 files changed, 58 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 00d538f55a3cf..622b1bd3f5bf5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4289,19 +4289,23 @@ enum srcStatus { IS_NEG, IS_UPPER_HALF_NEG, IS_LOWER_HALF_NEG, - LAST_STAT = IS_LOWER_HALF_NEG + INVALID }; static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { - assert(MI->getOpcode() == AMDGPU::G_TRUNC); + if (MI->getOpcode() != AMDGPU::G_TRUNC) { + return false; + } unsigned dstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits(); unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); return dstSize * 2 == srcSize; } static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { - assert(MI->getOpcode() == AMDGPU::G_LSHR); + if (MI->getOpcode() != AMDGPU::G_LSHR) { + return false; + } Register ShiftSrc; std::optional ShiftAmt; if (mi_match(MI->getOperand(0).getReg(), MRI, @@ -4314,7 +4318,9 @@ static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { } static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { - assert(MI->getOpcode() == AMDGPU::G_SHL); + if (MI->getOpcode() != AMDGPU::G_SHL) { + return false; + } Register ShiftSrc; std::optional ShiftAmt; if (mi_match(MI->getOperand(0).getReg(), MRI, @@ -4326,8 +4332,11 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { return false; } -static bool retOpStat(const MachineOperand *Op, int stat, - std::pair &curr) { +static bool retOpStat(const MachineOperand *Op, srcStatus stat, + std::pair &curr) { + if (stat == INVALID) { + return false; + } if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() || Op->isCImm() || Op->isFPImm()) { curr = {Op, stat}; @@ -4336,7 +4345,25 @@ static bool retOpStat(const MachineOperand *Op, int stat, return false; } -static bool calcNextStatus(std::pair &curr, +srcStatus getNegStatus(srcStatus S) { + switch (S) { + case IS_SAME: + return IS_NEG; + case IS_UPPER_HALF: + return IS_UPPER_HALF_NEG; + case IS_LOWER_HALF: + return IS_LOWER_HALF_NEG; + case IS_NEG: + return IS_SAME; + case IS_UPPER_HALF_NEG: + return IS_UPPER_HALF; + case IS_LOWER_HALF_NEG: + return IS_LOWER_HALF; + } + return INVALID; +} + +static bool calcNextStatus(std::pair &curr, const MachineRegisterInfo &MRI) { if (!curr.first->isReg()) { return false; @@ -4363,92 +4390,56 @@ static bool calcNextStatus(std::pair &curr, return retOpStat(&MI->getOperand(1), curr.second, curr); case AMDGPU::G_FNEG: // XXXX + 3 = XXXX_NEG, (XXXX_NEG + 3) mod 3 = XXXX - return retOpStat(&MI->getOperand(1), - (curr.second + ((LAST_STAT + 1) / 2)) % (LAST_STAT + 1), - curr); + return retOpStat(&MI->getOperand(1), getNegStatus(curr.second), curr); } // Calc next stat from current stat switch (curr.second) { case IS_SAME: - switch (Opc) { - case AMDGPU::G_TRUNC: { - if (isTruncHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr); - } - break; - } + if (isTruncHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr); } break; case IS_NEG: - switch (Opc) { - case AMDGPU::G_TRUNC: { - if (isTruncHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr); - } - break; - } + if (isTruncHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr); } break; case IS_UPPER_HALF: - switch (Opc) { - case AMDGPU::G_SHL: { - if (isShlHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr); - } - break; - } + if (isShlHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr); } break; case IS_LOWER_HALF: - switch (Opc) { - case AMDGPU::G_LSHR: { - if (isLshrHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, curr); - } - break; - } + if (isLshrHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, curr); } break; case IS_UPPER_HALF_NEG: - switch (Opc) { - case AMDGPU::G_SHL: { - if (isShlHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr); - } - break; - } + if (isShlHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr); } break; case IS_LOWER_HALF_NEG: - switch (Opc) { - case AMDGPU::G_LSHR: { - if (isLshrHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, curr); - } - break; - } + if (isLshrHalf(MI, MRI)) { + return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, curr); } break; } return false; } -SmallVector> +SmallVector> getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, bool onlyLastSameOrNeg = false, int maxDepth = 6) { int depth = 0; - std::pair curr = {Op, IS_SAME}; - SmallVector> statList; + std::pair curr = {Op, IS_SAME}; + SmallVector> statList; - while (true) { + while (depth <= maxDepth && calcNextStatus(curr, MRI)) { depth++; - if (depth > maxDepth) { - break; - } - bool ret = calcNextStatus(curr, MRI); - if (!ret || (onlyLastSameOrNeg && - (curr.second != IS_SAME && curr.second != IS_NEG))) { + if ((onlyLastSameOrNeg && + (curr.second != IS_SAME && curr.second != IS_NEG))) { break; } else if (!onlyLastSameOrNeg) { statList.push_back(curr); @@ -4535,7 +4526,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, bool IsDOT) const { unsigned Mods = 0; const MachineOperand *RootOp = Op; - std::pair stat = getSrcStats(Op, MRI, true)[0]; + std::pair stat = + getSrcStats(Op, MRI, true)[0]; if (!stat.first->isReg()) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; @@ -4547,8 +4539,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, MachineInstr *MI = MRI.getVRegDef(Op->getReg()); if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 && (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { - SmallVector> statList_Hi; - SmallVector> statList_Lo; + SmallVector> statList_Hi; + SmallVector> statList_Lo; statList_Hi = getSrcStats(&MI->getOperand(2), MRI); if (statList_Hi.size() != 0) { statList_Lo = getSrcStats(&MI->getOperand(1), MRI); diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index aad7a088551b2..50921879cd1f2 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -466,7 +466,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From c6c4b3e92063adae31e67fbad6d64f6f77f71324 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 12 Mar 2025 16:39:19 +0800 Subject: [PATCH 05/20] fix comments --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 10 +++------- llvm/test/lit.cfg.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 622b1bd3f5bf5..59ccb1b7ed236 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4288,8 +4288,7 @@ enum srcStatus { IS_LOWER_HALF, IS_NEG, IS_UPPER_HALF_NEG, - IS_LOWER_HALF_NEG, - INVALID + IS_LOWER_HALF_NEG }; static bool isTruncHalf(const MachineInstr *MI, @@ -4334,9 +4333,6 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { static bool retOpStat(const MachineOperand *Op, srcStatus stat, std::pair &curr) { - if (stat == INVALID) { - return false; - } if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() || Op->isCImm() || Op->isFPImm()) { curr = {Op, stat}; @@ -4360,7 +4356,7 @@ srcStatus getNegStatus(srcStatus S) { case IS_LOWER_HALF_NEG: return IS_LOWER_HALF; } - return INVALID; + llvm_unreachable("unexpected srcStatus"); } static bool calcNextStatus(std::pair &curr, @@ -4477,7 +4473,7 @@ static bool isSameOperand(const MachineOperand *Op1, return Op1->isIdenticalTo(*Op2); } -static bool validToPack(int HiStat, int LoStat, unsigned int &Mods, +static bool validToPack(srcStatus HiStat, srcStatus LoStat, unsigned int &Mods, const MachineOperand *newOp, const MachineOperand *RootOp, const SIInstrInfo &TII, const MachineRegisterInfo &MRI) { diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 50921879cd1f2..aad7a088551b2 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -466,7 +466,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("utf-8") + readobj_out = readobj_cmd.stdout.read().decode("ascii") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 6378180cc336aef22a0c256f26321c4cecedae24 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 17 Mar 2025 11:10:42 +0800 Subject: [PATCH 06/20] fix comments and test case --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 216 +++++++++--------- llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll | 3 +- 2 files changed, 105 insertions(+), 114 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 58355c6ee7f43..6ffb5bc4a788a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4293,7 +4293,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { }}; } -enum srcStatus { +enum SrcStatus { IS_SAME, IS_UPPER_HALF, IS_LOWER_HALF, @@ -4304,55 +4304,55 @@ enum srcStatus { static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { - if (MI->getOpcode() != AMDGPU::G_TRUNC) { + if (MI->getOpcode() != AMDGPU::G_TRUNC) return false; - } - unsigned dstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits(); - unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); - return dstSize * 2 == srcSize; + + unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits(); + unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); + return DstSize * 2 == SrcSize; } static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { - if (MI->getOpcode() != AMDGPU::G_LSHR) { + if (MI->getOpcode() != AMDGPU::G_LSHR) return false; - } + Register ShiftSrc; std::optional ShiftAmt; if (mi_match(MI->getOperand(0).getReg(), MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) { - unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); - unsigned shift = ShiftAmt->Value.getZExtValue(); - return shift * 2 == srcSize; + unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); + unsigned Shift = ShiftAmt->Value.getZExtValue(); + return Shift * 2 == SrcSize; } return false; } static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { - if (MI->getOpcode() != AMDGPU::G_SHL) { + if (MI->getOpcode() != AMDGPU::G_SHL) return false; - } + Register ShiftSrc; std::optional ShiftAmt; if (mi_match(MI->getOperand(0).getReg(), MRI, m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) { - unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); - unsigned shift = ShiftAmt->Value.getZExtValue(); - return shift * 2 == srcSize; + unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); + unsigned Shift = ShiftAmt->Value.getZExtValue(); + return Shift * 2 == SrcSize; } return false; } -static bool retOpStat(const MachineOperand *Op, srcStatus stat, - std::pair &curr) { +static bool retOpStat(const MachineOperand *Op, SrcStatus Stat, + std::pair &Curr) { if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() || - Op->isCImm() || Op->isFPImm()) { - curr = {Op, stat}; - return true; - } + Op->isCImm() || Op->isFPImm()) + Curr = {Op, Stat}; + return true; + return false; } -srcStatus getNegStatus(srcStatus S) { +SrcStatus getNegStatus(SrcStatus S) { switch (S) { case IS_SAME: return IS_NEG; @@ -4367,24 +4367,23 @@ srcStatus getNegStatus(srcStatus S) { case IS_LOWER_HALF_NEG: return IS_LOWER_HALF; } - llvm_unreachable("unexpected srcStatus"); + llvm_unreachable("unexpected SrcStatus"); } -static bool calcNextStatus(std::pair &curr, +static bool calcNextStatus(std::pair &Curr, const MachineRegisterInfo &MRI) { - if (!curr.first->isReg()) { + if (!Curr.first->isReg()) return false; - } + const MachineInstr *MI = nullptr; - if (!curr.first->isDef()) { - MI = MRI.getVRegDef(curr.first->getReg()); + if (!Curr.first->isDef()) { + MI = MRI.getVRegDef(Curr.first->getReg()); } else { - MI = curr.first->getParent(); + MI = Curr.first->getParent(); } - if (!MI) { + if (!MI) return false; - } unsigned Opc = MI->getOpcode(); @@ -4394,83 +4393,77 @@ static bool calcNextStatus(std::pair &curr, case AMDGPU::G_CONSTANT: case AMDGPU::G_FCONSTANT: case AMDGPU::COPY: - return retOpStat(&MI->getOperand(1), curr.second, curr); + return retOpStat(&MI->getOperand(1), Curr.second, Curr); case AMDGPU::G_FNEG: // XXXX + 3 = XXXX_NEG, (XXXX_NEG + 3) mod 3 = XXXX - return retOpStat(&MI->getOperand(1), getNegStatus(curr.second), curr); + return retOpStat(&MI->getOperand(1), getNegStatus(Curr.second), Curr); } - // Calc next stat from current stat - switch (curr.second) { + // Calc next Stat from current Stat + switch (Curr.second) { case IS_SAME: - if (isTruncHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr); - } + if (isTruncHalf(MI, MRI)) + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, Curr); break; case IS_NEG: - if (isTruncHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr); - } + if (isTruncHalf(MI, MRI)) + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, Curr); break; case IS_UPPER_HALF: - if (isShlHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr); - } + if (isShlHalf(MI, MRI)) + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, Curr); break; case IS_LOWER_HALF: - if (isLshrHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, curr); - } + if (isLshrHalf(MI, MRI)) + return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, Curr); break; case IS_UPPER_HALF_NEG: - if (isShlHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr); - } + if (isShlHalf(MI, MRI)) + return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, Curr); break; case IS_LOWER_HALF_NEG: - if (isLshrHalf(MI, MRI)) { - return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, curr); - } + if (isLshrHalf(MI, MRI)) + return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, Curr); break; } return false; } -SmallVector> +SmallVector> getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, bool onlyLastSameOrNeg = false, int maxDepth = 6) { int depth = 0; - std::pair curr = {Op, IS_SAME}; - SmallVector> statList; + std::pair Curr = {Op, IS_SAME}; + SmallVector> Statlist; - while (depth <= maxDepth && calcNextStatus(curr, MRI)) { + while (depth <= maxDepth && calcNextStatus(Curr, MRI)) { depth++; if ((onlyLastSameOrNeg && - (curr.second != IS_SAME && curr.second != IS_NEG))) { + (Curr.second != IS_SAME && Curr.second != IS_NEG))) { break; } else if (!onlyLastSameOrNeg) { - statList.push_back(curr); + Statlist.push_back(Curr); } } if (onlyLastSameOrNeg) { - statList.push_back(curr); + Statlist.push_back(Curr); } - return statList; + return Statlist; } static bool isInlinableConstant(const MachineOperand &Op, const SIInstrInfo &TII) { - if (Op.isFPImm()) { + if (Op.isFPImm()) return TII.isInlineConstant(Op.getFPImm()->getValueAPF()); - } + return false; } static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2, const MachineRegisterInfo &MRI) { - unsigned width1 = MRI.getType(Op1->getReg()).getSizeInBits(); - unsigned width2 = MRI.getType(Op2->getReg()).getSizeInBits(); - return width1 == width2; + unsigned Width1 = MRI.getType(Op1->getReg()).getSizeInBits(); + unsigned Width2 = MRI.getType(Op2->getReg()).getSizeInBits(); + return Width1 == Width2; } static bool isSameOperand(const MachineOperand *Op1, @@ -4484,12 +4477,12 @@ static bool isSameOperand(const MachineOperand *Op1, return Op1->isIdenticalTo(*Op2); } -static bool validToPack(srcStatus HiStat, srcStatus LoStat, unsigned int &Mods, - const MachineOperand *newOp, +static bool validToPack(SrcStatus HiStat, SrcStatus LoStat, unsigned int &Mods, + const MachineOperand *NewOp, const MachineOperand *RootOp, const SIInstrInfo &TII, const MachineRegisterInfo &MRI) { - if (newOp->isReg()) { - if (isSameBitWidth(newOp, RootOp, MRI)) { + if (NewOp->isReg()) { + if (isSameBitWidth(NewOp, RootOp, MRI)) { // IS_LOWER_HALF remain 0 if (HiStat == IS_UPPER_HALF_NEG) { Mods ^= SISrcMods::NEG_HI; @@ -4512,7 +4505,7 @@ static bool validToPack(srcStatus HiStat, srcStatus LoStat, unsigned int &Mods, } else { if ((HiStat == IS_SAME || HiStat == IS_NEG) && (LoStat == IS_SAME || LoStat == IS_NEG) && - isInlinableConstant(*newOp, TII)) { + isInlinableConstant(*NewOp, TII)) { if (HiStat == IS_NEG) { Mods ^= SISrcMods::NEG_HI; } @@ -4533,31 +4526,31 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, bool IsDOT) const { unsigned Mods = 0; const MachineOperand *RootOp = Op; - std::pair stat = + std::pair Stat = getSrcStats(Op, MRI, true)[0]; - if (!stat.first->isReg()) { + if (!Stat.first->isReg()) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; } - if (stat.second == IS_NEG) { + if (Stat.second == IS_NEG) { Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); } - Op = stat.first; + Op = Stat.first; MachineInstr *MI = MRI.getVRegDef(Op->getReg()); if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 && (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { - SmallVector> statList_Hi; - SmallVector> statList_Lo; - statList_Hi = getSrcStats(&MI->getOperand(2), MRI); - if (statList_Hi.size() != 0) { - statList_Lo = getSrcStats(&MI->getOperand(1), MRI); - if (statList_Lo.size() != 0) { - for (int i = statList_Hi.size() - 1; i >= 0; i--) { - for (int j = statList_Lo.size() - 1; j >= 0; j--) { - if (isSameOperand(statList_Hi[i].first, statList_Lo[j].first)) { - if (validToPack(statList_Hi[i].second, statList_Lo[j].second, - Mods, statList_Hi[i].first, RootOp, TII, MRI)) { - return {statList_Hi[i].first, Mods}; + SmallVector> Statlist_Hi; + SmallVector> Statlist_Lo; + Statlist_Hi = getSrcStats(&MI->getOperand(2), MRI); + if (Statlist_Hi.size() != 0) { + Statlist_Lo = getSrcStats(&MI->getOperand(1), MRI); + if (Statlist_Lo.size() != 0) { + for (int i = Statlist_Hi.size() - 1; i >= 0; i--) { + for (int j = Statlist_Lo.size() - 1; j >= 0; j--) { + if (isSameOperand(Statlist_Hi[i].first, Statlist_Lo[j].first)) { + if (validToPack(Statlist_Hi[i].second, Statlist_Lo[j].second, + Mods, Statlist_Hi[i].first, RootOp, TII, MRI)) { + return {Statlist_Hi[i].first, Mods}; } } } @@ -4591,21 +4584,20 @@ bool checkRB(const MachineOperand *Op, int RBNo, } const MachineOperand * -getVReg(const MachineOperand *newOp, const MachineOperand *RootOp, +getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII) { // RootOp can only be VGPR or SGPR (some hand written cases such as // inst-select-ashr.v2s16.mir::ashr_v2s16_vs) if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) || - checkRB(newOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) { - return newOp; - } + checkRB(NewOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) + return NewOp; + MachineInstr *MI = MRI.getVRegDef(RootOp->getReg()); if (MI->getOpcode() == AMDGPU::COPY && - isSameOperand(newOp, &MI->getOperand(1))) { - // RootOp is VGPR, newOp is not VGPR, but RootOp = COPY newOp + isSameOperand(NewOp, &MI->getOperand(1))) + // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp return RootOp; - } MachineBasicBlock *BB = MI->getParent(); const TargetRegisterClass *DstRC = @@ -4614,7 +4606,7 @@ getVReg(const MachineOperand *newOp, const MachineOperand *RootOp, MachineInstrBuilder MIB = BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), dstReg) - .addReg(newOp->getReg()); + .addReg(NewOp->getReg()); // only accept VGPR return &MIB->getOperand(0); @@ -4625,18 +4617,18 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); - std::pair res = + std::pair Res = selectVOP3PModsImpl(&Root, MRI); - if (!(res.first->isReg())) { + if (!(Res.first->isReg())) return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Res.first)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods }}; - } - res.first = getVReg(res.first, &Root, RBI, MRI, TRI, TII); + + Res.first = getVReg(Res.first, &Root, RBI, MRI, TRI, TII); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(Res.first->getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods }}; } @@ -4645,18 +4637,18 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); - std::pair res = + std::pair Res = selectVOP3PModsImpl(&Root, MRI, true); - if (!(res.first->isReg())) { + if (!(Res.first->isReg())) return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Res.first)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods }}; - } - res.first = getVReg(res.first, &Root, RBI, MRI, TRI, TII); + + Res.first = getVReg(Res.first, &Root, RBI, MRI, TRI, TII); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(Res.first->getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods }}; } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index b561fa86f452c..c766ad2c418fd 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -824,8 +824,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; ; GFX11-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 -; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s0 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-GISEL-NEXT: ; return to shader part epilog ; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX10PLUS-SDAG: ; %bb.0: From b0feaff09679e869330f818be2efb550483aaeb2 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Tue, 18 Mar 2025 09:37:06 +0800 Subject: [PATCH 07/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 63 ++++++++++--------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 6ffb5bc4a788a..51735cf8daff4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4445,9 +4445,8 @@ getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, Statlist.push_back(Curr); } } - if (onlyLastSameOrNeg) { + if (onlyLastSameOrNeg) Statlist.push_back(Curr); - } return Statlist; } @@ -4469,9 +4468,8 @@ static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2, static bool isSameOperand(const MachineOperand *Op1, const MachineOperand *Op2) { if (Op1->isReg()) { - if (Op2->isReg()) { + if (Op2->isReg()) return Op1->getReg() == Op2->getReg(); - } return false; } return Op1->isIdenticalTo(*Op2); @@ -4506,12 +4504,10 @@ static bool validToPack(SrcStatus HiStat, SrcStatus LoStat, unsigned int &Mods, if ((HiStat == IS_SAME || HiStat == IS_NEG) && (LoStat == IS_SAME || LoStat == IS_NEG) && isInlinableConstant(*NewOp, TII)) { - if (HiStat == IS_NEG) { + if (HiStat == IS_NEG) Mods ^= SISrcMods::NEG_HI; - } - if (LoStat == IS_NEG) { + if (LoStat == IS_NEG) Mods ^= SISrcMods::NEG; - } // opsel = opsel_hi = 0, since the upper half and lower half both // the same as the target inlinable constant return true; @@ -4532,29 +4528,40 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; } - if (Stat.second == IS_NEG) { + if (Stat.second == IS_NEG) Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); - } + Op = Stat.first; MachineInstr *MI = MRI.getVRegDef(Op->getReg()); - if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 && - (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { - SmallVector> Statlist_Hi; - SmallVector> Statlist_Lo; - Statlist_Hi = getSrcStats(&MI->getOperand(2), MRI); - if (Statlist_Hi.size() != 0) { - Statlist_Lo = getSrcStats(&MI->getOperand(1), MRI); - if (Statlist_Lo.size() != 0) { - for (int i = Statlist_Hi.size() - 1; i >= 0; i--) { - for (int j = Statlist_Lo.size() - 1; j >= 0; j--) { - if (isSameOperand(Statlist_Hi[i].first, Statlist_Lo[j].first)) { - if (validToPack(Statlist_Hi[i].second, Statlist_Lo[j].second, - Mods, Statlist_Hi[i].first, RootOp, TII, MRI)) { - return {Statlist_Hi[i].first, Mods}; - } - } - } - } + + if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 || + (IsDOT && Subtarget->hasDOTOpSelHazard())) { + Mods |= SISrcMods::OP_SEL_1; + return {Op, Mods}; + } + + SmallVector> Statlist_Hi; + Statlist_Hi = getSrcStats(&MI->getOperand(2), MRI); + + if (Statlist_Hi.size() == 0) { + Mods |= SISrcMods::OP_SEL_1; + return {Op, Mods}; + } + + SmallVector> Statlist_Lo; + Statlist_Lo = getSrcStats(&MI->getOperand(1), MRI); + + if (Statlist_Lo.size() == 0) { + Mods |= SISrcMods::OP_SEL_1; + return {Op, Mods}; + } + + for (int i = Statlist_Hi.size() - 1; i >= 0; i--) { + for (int j = Statlist_Lo.size() - 1; j >= 0; j--) { + if (isSameOperand(Statlist_Hi[i].first, Statlist_Lo[j].first)) { + if (validToPack(Statlist_Hi[i].second, Statlist_Lo[j].second, Mods, + Statlist_Hi[i].first, RootOp, TII, MRI)) + return {Statlist_Hi[i].first, Mods}; } } } From 53370d8b98879a55bef1094b13a7e18195a315e6 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Tue, 18 Mar 2025 09:41:17 +0800 Subject: [PATCH 08/20] fix conflict --- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 2003 ++++++++++++++++++++--- llvm/test/lit.cfg.py | 2 +- 2 files changed, 1811 insertions(+), 194 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 954c05e63542f..28a995e74f7ab 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -1,13 +1,34 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s - -; GCN-LABEL: {{^}}fadd_v2_vv: -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s + define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_vv: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_vv: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -16,10 +37,30 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_vs: -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { +; GFX900-LABEL: fadd_v2_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_vs: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -28,10 +69,49 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ret void } -; GCN-LABEL: {{^}}fadd_v4_vs: -; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { +; GFX900-LABEL: fadd_v4_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v3, s3, v3 +; GFX900-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX900-NEXT: v_add_f32_e32 v1, s1, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v4_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v4_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id %load = load <4 x float>, ptr addrspace(1) %gep, align 16 @@ -40,10 +120,163 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ret void } -; GCN-LABEL: {{^}}fadd_v32_vs: -; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { +; GFX900-LABEL: fadd_v32_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 +; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] +; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 +; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 +; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 +; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 +; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 +; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 +; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 +; GFX900-NEXT: v_add_f32_e32 v31, s18, v31 +; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 +; GFX900-NEXT: v_add_f32_e32 v29, s16, v29 +; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 +; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 +; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 +; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 +; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 +; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 +; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 +; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 +; GFX900-NEXT: v_add_f32_e32 v13, s44, v13 +; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 +; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 +; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 +; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 +; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 +; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 +; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 +; GFX900-NEXT: v_add_f32_e32 v21, s8, v21 +; GFX900-NEXT: v_add_f32_e32 v28, s23, v28 +; GFX900-NEXT: v_add_f32_e32 v27, s22, v27 +; GFX900-NEXT: v_add_f32_e32 v26, s21, v26 +; GFX900-NEXT: v_add_f32_e32 v25, s20, v25 +; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 +; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 +; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 +; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 +; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 +; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 +; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] +; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v32_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] +; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[38:39] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[48:49] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[50:51] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[44:45] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[46:47] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[16:17] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[18:19] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[12:13] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[14:15] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[10:11] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[20:21] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[22:23] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[36:37] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[8:9] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v32_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[40:41] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[42:43] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[44:45] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[46:47] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[48:49] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[50:51] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[8:9] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[10:11] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[12:13] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[14:15] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[16:17] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[18:19] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[20:21] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[22:23] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id %load = load <32 x float>, ptr addrspace(1) %gep, align 128 @@ -53,13 +286,45 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { } ; FIXME: GISel does not use op_sel for splat constants. - -; GCN-LABEL: {{^}}fadd_v2_v_imm: -; PACKED: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_imm: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 0x42c80000, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 0x42c80000, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_imm: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_imm: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -68,11 +333,43 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_v_splat: -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0 -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}} define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_v_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_v_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_v_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -84,11 +381,42 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_lit_splat: -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_lit_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_lit_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_lit_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -97,12 +425,31 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; PACKED-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000 -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]] define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_lit_hi0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_v_lit_hi0: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_mov_b64 s[2:3], 0x3f800000 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -111,13 +458,32 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_lit_lo0: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; PACKED-DAG: s_mov_b32 s[[LO:[0-9]+]], 0 -; PACKED-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0 -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[LO]]:[[HI]]]{{$}} define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_lit_lo0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_v_lit_lo0: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_mov_b32 s2, 0 +; PACKED-NEXT: s_mov_b32 s3, 1.0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -126,13 +492,32 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_unfoldable_lit: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 1.0 -; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 2.0 -; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { +; GFX900-LABEL: fadd_v2_v_unfoldable_lit: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 2.0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fadd_v2_v_unfoldable_lit: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_mov_b32 s2, 1.0 +; PACKED-NEXT: s_mov_b32 s3, 2.0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -142,12 +527,47 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { } ; FIXME: Fold fneg into v_pk_add_f32 with Global ISel. - -; GCN-LABEL: {{^}}fadd_v2_v_fneg: -; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fadd_v2_v_fneg: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1 +; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -159,12 +579,47 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fadd_v2_v_fneg_lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, s2, v1 +; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg_lo: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg_lo: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, s2 +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -176,12 +631,47 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fadd_v2_v_fneg_hi: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg_hi: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg_hi: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; PACKED-GISEL-NEXT: v_max_f32_e64 v3, -s2, -s2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -193,12 +683,44 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo2: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, float %y) { +; GFX900-LABEL: fadd_v2_v_fneg_lo2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg_lo2: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] neg_lo:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg_lo2: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, s3 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -210,12 +732,44 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo ret void } -; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi2: -; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, float %y) { +; GFX900-LABEL: fadd_v2_v_fneg_hi2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s3, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_v2_v_fneg_hi2: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_v2_v_fneg_hi2: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v3, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -227,10 +781,30 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo ret void } -; GCN-LABEL: {{^}}fmul_v2_vv: -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_vv: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fmul_v2_vv: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[0:1] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -239,10 +813,30 @@ define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_vs: -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { +; GFX900-LABEL: fmul_v2_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fmul_v2_vs: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -251,10 +845,49 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ret void } -; GCN-LABEL: {{^}}fmul_v4_vs: -; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { +; GFX900-LABEL: fmul_v4_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v3, s3, v3 +; GFX900-NEXT: v_mul_f32_e32 v2, s2, v2 +; GFX900-NEXT: v_mul_f32_e32 v1, s1, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v4_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v4_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id %load = load <4 x float>, ptr addrspace(1) %gep, align 16 @@ -263,10 +896,163 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ret void } -; GCN-LABEL: {{^}}fmul_v32_vs: -; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { +; GFX900-LABEL: fmul_v32_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 +; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] +; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 +; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 +; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 +; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 +; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 +; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 +; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 +; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 +; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 +; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 +; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 +; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 +; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 +; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 +; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 +; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 +; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 +; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 +; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 +; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 +; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 +; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 +; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 +; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 +; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 +; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 +; GFX900-NEXT: v_mul_f32_e32 v21, s8, v21 +; GFX900-NEXT: v_mul_f32_e32 v28, s23, v28 +; GFX900-NEXT: v_mul_f32_e32 v27, s22, v27 +; GFX900-NEXT: v_mul_f32_e32 v26, s21, v26 +; GFX900-NEXT: v_mul_f32_e32 v25, s20, v25 +; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 +; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 +; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 +; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 +; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 +; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 +; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] +; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v32_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] +; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[38:39] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[48:49] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[50:51] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[44:45] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[46:47] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[16:17] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[18:19] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[12:13] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[14:15] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[10:11] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[20:21] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[22:23] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[36:37] +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[8:9] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v32_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[40:41] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[42:43] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[44:45] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[46:47] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[48:49] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[50:51] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[8:9] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[10:11] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[12:13] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[14:15] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[16:17] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[18:19] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[20:21] +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[22:23] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id %load = load <32 x float>, ptr addrspace(1) %gep, align 128 @@ -275,12 +1061,45 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_imm: -; PACKED: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}} -; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_v_imm: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, 0x42c80000, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, 0x42c80000, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v2_v_imm: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v2_v_imm: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-GISEL-NEXT: s_mov_b32 s3, s2 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -289,11 +1108,43 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_v_splat: -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0 -; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}} define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_v_v_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v0 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v2_v_v_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v2_v_v_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -305,11 +1156,42 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_lit_splat: -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_v_lit_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v2_v_lit_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v2_v_lit_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -318,13 +1200,32 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_unfoldable_lit: -; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0x40400000, v{{[0-9]+}} -; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 4.0 -; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000 -; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { +; GFX900-LABEL: fmul_v2_v_unfoldable_lit: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v1, 0x40400000, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fmul_v2_v_unfoldable_lit: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_mov_b32 s2, 4.0 +; PACKED-NEXT: s_mov_b32 s3, 0x40400000 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -333,11 +1234,47 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fmul_v2_v_fneg: -; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}} -; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fmul_v2_v_fneg: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e64 v1, v1, -s2 +; GFX900-NEXT: v_mul_f32_e64 v0, v0, -s2 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fmul_v2_v_fneg: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fmul_v2_v_fneg: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -349,10 +1286,30 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}fma_v2_vv: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_vv: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, v1, v1 +; GFX900-NEXT: v_fma_f32 v0, v0, v0, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fma_v2_vv: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -361,10 +1318,30 @@ define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_vs: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { +; GFX900-LABEL: fma_v2_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, s3, s3 +; GFX900-NEXT: v_fma_f32 v0, v0, s2, s2 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-LABEL: fma_v2_vs: +; PACKED: ; %bb.0: +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] +; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -373,10 +1350,49 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ret void } -; GCN-LABEL: {{^}}fma_v4_vs: -; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; PACKED-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { +; GFX900-LABEL: fma_v4_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v3, v3, s3, s3 +; GFX900-NEXT: v_fma_f32 v2, v2, s2, s2 +; GFX900-NEXT: v_fma_f32 v1, v1, s1, s1 +; GFX900-NEXT: v_fma_f32 v0, v0, s0, s0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v4_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v4_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id %load = load <4 x float>, ptr addrspace(1) %gep, align 16 @@ -385,10 +1401,163 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ret void } -; GCN-LABEL: {{^}}fma_v32_vs: -; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; PACKED-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { +; GFX900-LABEL: fma_v32_vs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 +; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] +; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 +; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 +; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 +; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 +; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 +; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 +; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 +; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 +; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 +; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 +; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 +; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 +; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 +; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 +; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 +; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 +; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 +; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 +; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 +; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 +; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 +; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 +; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 +; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 +; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 +; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 +; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 +; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 +; GFX900-NEXT: v_fma_f32 v21, v21, s8, s8 +; GFX900-NEXT: v_fma_f32 v28, v28, s23, s23 +; GFX900-NEXT: v_fma_f32 v27, v27, s22, s22 +; GFX900-NEXT: v_fma_f32 v26, v26, s21, s21 +; GFX900-NEXT: v_fma_f32 v25, v25, s20, s20 +; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 +; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 +; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 +; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 +; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 +; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 +; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] +; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v32_vs: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] +; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[38:39], s[38:39] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[48:49], s[48:49] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[50:51], s[50:51] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[44:45], s[44:45] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[46:47], s[46:47] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[16:17], s[16:17] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[18:19], s[18:19] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[10:11], s[10:11] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[20:21], s[20:21] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[22:23], s[22:23] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[36:37], s[36:37] +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[8:9], s[8:9] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48 +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] +; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v32_vs: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[40:41], s[40:41] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[42:43], s[42:43] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[44:45], s[44:45] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[46:47], s[46:47] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[48:49], s[48:49] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[50:51], s[50:51] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[8:9], s[8:9] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[10:11], s[10:11] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[16:17], s[16:17] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[18:19], s[18:19] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[20:21], s[20:21] +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[22:23], s[22:23] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id %load = load <32 x float>, ptr addrspace(1) %gep, align 128 @@ -397,14 +1566,34 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_imm: -; GCN-DAG: s_mov_b32 s[[K1:[0-9]+]], 0x42c80000 -; GFX900-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000 -; PACKED-SDAG-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000 -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]] -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_v_imm: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_mov_b32 s2, 0x42c80000 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x43480000 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, s2, v3 +; GFX900-NEXT: v_fma_f32 v0, v0, s2, v3 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_imm: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0x43480000 +; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -413,11 +1602,43 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_v_splat: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0 -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1]{{$}} define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_v_v_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v2, v2, v0, v0 +; GFX900-NEXT: v_fma_f32 v1, v1, v0, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_v_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1] op_sel_hi:[1,0,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v2_v_v_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -429,11 +1650,42 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_lit_splat: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0 -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}} define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_v_lit_splat: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, 4.0, 1.0 +; GFX900-NEXT: v_fma_f32 v0, v0, 4.0, 1.0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_lit_splat: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0 op_sel_hi:[1,0,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v2_v_lit_splat: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -442,15 +1694,35 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_unfoldable_lit: -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000 -; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0 -; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, 2.0 -; PACKED-SDAG-DAG: s_mov_b32 s{{[0-9]+}}, 4.0 -; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 -; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 -; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { +; GFX900-LABEL: fma_v2_v_unfoldable_lit: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_mov_b32 s2, 0x40400000 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, s2, 2.0 +; GFX900-NEXT: v_fma_f32 v0, v0, 4.0, 1.0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_unfoldable_lit: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-SDAG-NEXT: s_mov_b32 s2, 4.0 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 1.0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, 2.0 +; PACKED-SDAG-NEXT: s_mov_b32 s3, 0x40400000 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -459,11 +1731,47 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fma_v2_v_fneg: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}} -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { +; GFX900-LABEL: fma_v2_v_fneg: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, -s2, -s2 +; GFX900-NEXT: v_fma_f32 v0, v0, -s2, -s2 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_v2_v_fneg: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_v2_v_fneg: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] +; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -475,11 +1783,51 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { ret void } -; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo: -; GFX900-COUNT-2: v_sub_f32_e32 -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) { +; GFX900-LABEL: add_vector_neg_bitcast_scalar_lo: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s2 +; GFX900-NEXT: v_mov_b32_e32 v2, s3 +; GFX900-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX900-NEXT: ds_read_b32 v2, v2 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: add_vector_neg_bitcast_scalar_lo: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; PACKED-SDAG-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; PACKED-SDAG-NEXT: ds_read_b32 v2, v2 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: add_vector_neg_bitcast_scalar_lo: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; PACKED-GISEL-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; PACKED-GISEL-NEXT: ds_read_b32 v2, v2 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4 %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4 @@ -493,11 +1841,59 @@ bb: ret void } -; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi: -; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) { +; GFX900-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, s2 +; GFX900-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX900-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX900-NEXT: ds_read_b32 v5, v4 +; GFX900-NEXT: ds_read_b32 v4, v4 offset:8 +; GFX900-NEXT: s_waitcnt lgkmcnt(1) +; GFX900-NEXT: v_fma_f32 v0, v0, v2, -v5 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_fma_f32 v1, v1, v3, -v4 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; PACKED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; PACKED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; PACKED-SDAG-NEXT: ds_read_b32 v4, v5 +; PACKED-SDAG-NEXT: ds_read_b32 v5, v5 offset:8 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; PACKED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; PACKED-GISEL-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; PACKED-GISEL-NEXT: ds_read_b32 v4, v5 +; PACKED-GISEL-NEXT: ds_read_b32 v5, v5 offset:8 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1] +; PACKED-GISEL-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; PACKED-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1 %arg2.gep = getelementptr inbounds float, ptr addrspace(3) %arg2, i32 2 @@ -517,11 +1913,51 @@ bb: ret void } -; GCN-LABEL: {{^}}shuffle_add_f32: -; GFX900-COUNT-2: v_add_f32_e32 -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GFX900-LABEL: shuffle_add_f32: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: ds_read_b64 v[0:1], v2 +; GFX900-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: shuffle_add_f32: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-SDAG-NEXT: ds_read_b64 v[0:1], v2 +; PACKED-SDAG-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: shuffle_add_f32: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-GISEL-NEXT: ds_read_b64 v[0:1], v2 +; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v3 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8 %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1 @@ -532,11 +1968,61 @@ bb: ret void } -; GCN-LABEL: {{^}}shuffle_neg_add_f32: -; GFX900-COUNT-2: v_sub_f32_e32 -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; GFX900-LABEL: shuffle_neg_add_f32: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: ds_read_b64 v[0:1], v2 +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: ds_read_b32 v3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: shuffle_neg_add_f32: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c +; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-SDAG-NEXT: ds_read_b64 v[0:1], v2 +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: ds_read_b32 v3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: shuffle_neg_add_f32: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-GISEL-NEXT: ds_read_b64 v[0:1], v2 +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: ds_read_b32 v3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] +; PACKED-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v2 +; PACKED-GISEL-NEXT: v_xor_b32_e32 v4, 0x80000000, v3 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8 %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1 @@ -549,16 +2035,26 @@ bb: ret void } -; GCN-LABEL: {{^}}fadd_fadd_fsub_0: -; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 -; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} - -; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 -; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} - -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { +; GFX900-LABEL: fadd_fadd_fsub_0: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e64 v0, s1, 0 +; GFX900-NEXT: v_add_f32_e32 v1, 0, v0 +; GFX900-NEXT: v_mov_b32_e32 v0, s0 +; GFX900-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_fadd_fsub_0: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_add_f32_e64 v0, s1, 0 +; PACKED-SDAG-NEXT: v_add_f32_e32 v1, 0, v0 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; PACKED-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; PACKED-SDAG-NEXT: s_endpgm bb: %i12 = fadd <2 x float> zeroinitializer, %arg %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> @@ -569,16 +2065,36 @@ bb: ret void } -; GCN-LABEL: {{^}}fadd_fadd_fsub: -; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} - -; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} - -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) { +; GFX900-LABEL: fadd_fadd_fsub: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s3 +; GFX900-NEXT: v_add_f32_e32 v0, s1, v0 +; GFX900-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-NEXT: v_add_f32_e32 v3, s2, v0 +; GFX900-NEXT: v_sub_f32_e32 v0, s0, v1 +; GFX900-NEXT: v_subrev_f32_e32 v1, s3, v3 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_fadd_fsub: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; PACKED-SDAG-NEXT: v_add_f32_e32 v0, s1, v0 +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, v0 +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] +; PACKED-SDAG-NEXT: s_endpgm bb: %i12 = fadd <2 x float> %arg, %arg1 %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> @@ -589,11 +2105,48 @@ bb: ret void } -; GCN-LABEL: {{^}}fadd_shuffle_v4: -; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; PACKED-SDAG-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} -; PACKED-GISEL-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) { +; GFX900-LABEL: fadd_shuffle_v4: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fadd_shuffle_v4: +; PACKED-SDAG: ; %bb.0: ; %bb +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] op_sel_hi:[1,0] +; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fadd_shuffle_v4: +; PACKED-GISEL: ; %bb.0: ; %bb +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 4, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v0 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] +; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5] +; PACKED-GISEL-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid @@ -604,12 +2157,44 @@ bb: ret void } -; GCN-LABEL: {{^}}fneg_v2f32_vec: -; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}} -; PACKED-GISEL-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}] op_sel_hi:[0,1]{{$}} define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { +; GFX900-LABEL: fneg_v2f32_vec: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX900-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fneg_v2f32_vec: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0) +; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 neg_lo:[1,1] neg_hi:[1,1] +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fneg_v2f32_vec: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) +; PACKED-GISEL-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; PACKED-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -618,9 +2203,41 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { ret void } -; GCN-LABEL: {{^}}fneg_v2f32_scalar: -; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x) { +; GFX900-LABEL: fneg_v2f32_scalar: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX900-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX900-NEXT: v_mov_b32_e32 v0, s2 +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-NEXT: s_endpgm +; +; PACKED-SDAG-LABEL: fneg_v2f32_scalar: +; PACKED-SDAG: ; %bb.0: +; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000 +; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: s_endpgm +; +; PACKED-GISEL-LABEL: fneg_v2f32_scalar: +; PACKED-GISEL: ; %bb.0: +; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-GISEL-NEXT: s_xor_b32 s2, s2, 0x80000000 +; PACKED-GISEL-NEXT: s_xor_b32 s3, s3, 0x80000000 +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] +; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-GISEL-NEXT: s_endpgm %fneg = fsub <2 x float> , %x store <2 x float> %fneg, ptr addrspace(1) %a, align 8 ret void diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index aad7a088551b2..50921879cd1f2 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -466,7 +466,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 3f178d21be053c26859291801a53291d2ceb4c9c Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Tue, 18 Mar 2025 09:41:53 +0800 Subject: [PATCH 09/20] fix lit --- llvm/test/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 50921879cd1f2..aad7a088551b2 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -466,7 +466,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("utf-8") + readobj_out = readobj_cmd.stdout.read().decode("ascii") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 79b89922430478452a3571aaab1e1cf6ee075837 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Tue, 18 Mar 2025 13:37:00 +0800 Subject: [PATCH 10/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 9722ba3c7b203..48d5b8b4cdfca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4362,9 +4362,10 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { static bool retOpStat(const MachineOperand *Op, SrcStatus Stat, std::pair &Curr) { if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() || - Op->isCImm() || Op->isFPImm()) + Op->isCImm() || Op->isFPImm()) { Curr = {Op, Stat}; - return true; + return true; + } return false; } @@ -4394,11 +4395,11 @@ static bool calcNextStatus(std::pair &Curr, const MachineInstr *MI = nullptr; - if (!Curr.first->isDef()) { + if (!Curr.first->isDef()) MI = MRI.getVRegDef(Curr.first->getReg()); - } else { + else MI = Curr.first->getParent(); - } + if (!MI) return false; @@ -4448,31 +4449,28 @@ static bool calcNextStatus(std::pair &Curr, SmallVector> getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, - bool onlyLastSameOrNeg = false, int maxDepth = 6) { - int depth = 0; + bool OnlyLastSameOrNeg = false, int MaxDepth = 6) { + int Depth = 0; std::pair Curr = {Op, IS_SAME}; SmallVector> Statlist; - while (depth <= maxDepth && calcNextStatus(Curr, MRI)) { - depth++; - if ((onlyLastSameOrNeg && - (Curr.second != IS_SAME && Curr.second != IS_NEG))) { + while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) { + Depth++; + if ((OnlyLastSameOrNeg && + (Curr.second != IS_SAME && Curr.second != IS_NEG))) break; - } else if (!onlyLastSameOrNeg) { + + if (!OnlyLastSameOrNeg) Statlist.push_back(Curr); - } } - if (onlyLastSameOrNeg) + if (OnlyLastSameOrNeg) Statlist.push_back(Curr); return Statlist; } static bool isInlinableConstant(const MachineOperand &Op, const SIInstrInfo &TII) { - if (Op.isFPImm()) - return TII.isInlineConstant(Op.getFPImm()->getValueAPF()); - - return false; + return Op.isFPImm() && TII.isInlineConstant(Op.getFPImm()->getValueAPF()); } static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2, @@ -4492,10 +4490,10 @@ static bool isSameOperand(const MachineOperand *Op1, return Op1->isIdenticalTo(*Op2); } -static bool validToPack(SrcStatus HiStat, SrcStatus LoStat, unsigned int &Mods, - const MachineOperand *NewOp, - const MachineOperand *RootOp, const SIInstrInfo &TII, - const MachineRegisterInfo &MRI) { +static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, + unsigned int &Mods, const MachineOperand *NewOp, + const MachineOperand *RootOp, const SIInstrInfo &TII, + const MachineRegisterInfo &MRI) { if (NewOp->isReg()) { if (isSameBitWidth(NewOp, RootOp, MRI)) { // IS_LOWER_HALF remain 0 @@ -4557,29 +4555,28 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, return {Op, Mods}; } - SmallVector> Statlist_Hi; - Statlist_Hi = getSrcStats(&MI->getOperand(2), MRI); + SmallVector> StatlistHi; + StatlistHi = getSrcStats(&MI->getOperand(2), MRI); - if (Statlist_Hi.size() == 0) { + if (StatlistHi.size() == 0) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; } - SmallVector> Statlist_Lo; - Statlist_Lo = getSrcStats(&MI->getOperand(1), MRI); + SmallVector> StatlistLo; + StatlistLo = getSrcStats(&MI->getOperand(1), MRI); - if (Statlist_Lo.size() == 0) { + if (StatlistLo.size() == 0) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; } - for (int i = Statlist_Hi.size() - 1; i >= 0; i--) { - for (int j = Statlist_Lo.size() - 1; j >= 0; j--) { - if (isSameOperand(Statlist_Hi[i].first, Statlist_Lo[j].first)) { - if (validToPack(Statlist_Hi[i].second, Statlist_Lo[j].second, Mods, - Statlist_Hi[i].first, RootOp, TII, MRI)) - return {Statlist_Hi[i].first, Mods}; - } + for (int i = StatlistHi.size() - 1; i >= 0; i--) { + for (int j = StatlistLo.size() - 1; j >= 0; j--) { + if (isSameOperand(StatlistHi[i].first, StatlistLo[j].first) && + isValidToPack(StatlistHi[i].second, StatlistLo[j].second, Mods, + StatlistHi[i].first, RootOp, TII, MRI)) + return {StatlistHi[i].first, Mods}; } } // Packed instructions do not have abs modifiers. @@ -4596,13 +4593,15 @@ int64_t getAllKindImm(const MachineOperand *Op) { return Op->getCImm()->getSExtValue(); case MachineOperand::MachineOperandType::MO_FPImmediate: return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue(); + default: + llvm_unreachable("not an imm type"); } - llvm_unreachable("not an imm type"); } -bool checkRB(const MachineOperand *Op, int RBNo, - const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) { +static bool checkRB(const MachineOperand *Op, unsigned int RBNo, + const AMDGPURegisterBankInfo &RBI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) { const RegisterBank *RB = RBI.getRegBank(Op->getReg(), MRI, TRI); return RB->getID() == RBNo; } @@ -4619,17 +4618,18 @@ getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp, MachineInstr *MI = MRI.getVRegDef(RootOp->getReg()); if (MI->getOpcode() == AMDGPU::COPY && - isSameOperand(NewOp, &MI->getOperand(1))) + isSameOperand(NewOp, &MI->getOperand(1))) { // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp return RootOp; + } MachineBasicBlock *BB = MI->getParent(); const TargetRegisterClass *DstRC = TRI.getConstrainedRegClassForOperand(*RootOp, MRI); - Register dstReg = MRI.createVirtualRegister(DstRC); + Register DstReg = MRI.createVirtualRegister(DstRC); MachineInstrBuilder MIB = - BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), dstReg) + BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) .addReg(NewOp->getReg()); // only accept VGPR From 136da478585e4963f081a991b5dcfab55c884879 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Tue, 18 Mar 2025 17:20:56 +0800 Subject: [PATCH 11/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 95 ++++++++++--------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 48d5b8b4cdfca..e63abe667842f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4310,7 +4310,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { }}; } -enum SrcStatus { +enum class SrcStatus { IS_SAME, IS_UPPER_HALF, IS_LOWER_HALF, @@ -4372,18 +4372,18 @@ static bool retOpStat(const MachineOperand *Op, SrcStatus Stat, SrcStatus getNegStatus(SrcStatus S) { switch (S) { - case IS_SAME: - return IS_NEG; - case IS_UPPER_HALF: - return IS_UPPER_HALF_NEG; - case IS_LOWER_HALF: - return IS_LOWER_HALF_NEG; - case IS_NEG: - return IS_SAME; - case IS_UPPER_HALF_NEG: - return IS_UPPER_HALF; - case IS_LOWER_HALF_NEG: - return IS_LOWER_HALF; + case SrcStatus::IS_SAME: + return SrcStatus::IS_NEG; + case SrcStatus::IS_UPPER_HALF: + return SrcStatus::IS_UPPER_HALF_NEG; + case SrcStatus::IS_LOWER_HALF: + return SrcStatus::IS_LOWER_HALF_NEG; + case SrcStatus::IS_NEG: + return SrcStatus::IS_SAME; + case SrcStatus::IS_UPPER_HALF_NEG: + return SrcStatus::IS_UPPER_HALF; + case SrcStatus::IS_LOWER_HALF_NEG: + return SrcStatus::IS_LOWER_HALF; } llvm_unreachable("unexpected SrcStatus"); } @@ -4405,7 +4405,7 @@ static bool calcNextStatus(std::pair &Curr, unsigned Opc = MI->getOpcode(); - // Handle general Opc cases + // Handle general Opc cases. switch (Opc) { case AMDGPU::G_BITCAST: case AMDGPU::G_CONSTANT: @@ -4413,35 +4413,38 @@ static bool calcNextStatus(std::pair &Curr, case AMDGPU::COPY: return retOpStat(&MI->getOperand(1), Curr.second, Curr); case AMDGPU::G_FNEG: - // XXXX + 3 = XXXX_NEG, (XXXX_NEG + 3) mod 3 = XXXX return retOpStat(&MI->getOperand(1), getNegStatus(Curr.second), Curr); + default: + break; } - // Calc next Stat from current Stat + // Calc next Stat from current Stat. switch (Curr.second) { - case IS_SAME: + case SrcStatus::IS_SAME: if (isTruncHalf(MI, MRI)) - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, Curr); + return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr); break; - case IS_NEG: + case SrcStatus::IS_NEG: if (isTruncHalf(MI, MRI)) - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, Curr); + return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr); break; - case IS_UPPER_HALF: + case SrcStatus::IS_UPPER_HALF: if (isShlHalf(MI, MRI)) - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, Curr); + return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr); break; - case IS_LOWER_HALF: + case SrcStatus::IS_LOWER_HALF: if (isLshrHalf(MI, MRI)) - return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, Curr); + return retOpStat(&MI->getOperand(1), SrcStatus::IS_UPPER_HALF, Curr); break; - case IS_UPPER_HALF_NEG: + case SrcStatus::IS_UPPER_HALF_NEG: if (isShlHalf(MI, MRI)) - return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, Curr); + return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr); break; - case IS_LOWER_HALF_NEG: + case SrcStatus::IS_LOWER_HALF_NEG: if (isLshrHalf(MI, MRI)) - return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, Curr); + return retOpStat(&MI->getOperand(1), SrcStatus::IS_UPPER_HALF_NEG, Curr); + break; + default: break; } return false; @@ -4451,13 +4454,13 @@ SmallVector> getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, bool OnlyLastSameOrNeg = false, int MaxDepth = 6) { int Depth = 0; - std::pair Curr = {Op, IS_SAME}; + std::pair Curr = {Op, SrcStatus::IS_SAME}; SmallVector> Statlist; while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) { Depth++; - if ((OnlyLastSameOrNeg && - (Curr.second != IS_SAME && Curr.second != IS_NEG))) + if ((OnlyLastSameOrNeg && (Curr.second != SrcStatus::IS_SAME && + Curr.second != SrcStatus::IS_NEG))) break; if (!OnlyLastSameOrNeg) @@ -4496,35 +4499,35 @@ static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, const MachineRegisterInfo &MRI) { if (NewOp->isReg()) { if (isSameBitWidth(NewOp, RootOp, MRI)) { - // IS_LOWER_HALF remain 0 - if (HiStat == IS_UPPER_HALF_NEG) { + // SrcStatus::IS_LOWER_HALF remain 0. + if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) { Mods ^= SISrcMods::NEG_HI; Mods |= SISrcMods::OP_SEL_1; - } else if (HiStat == IS_UPPER_HALF) { + } else if (HiStat == SrcStatus::IS_UPPER_HALF) { Mods |= SISrcMods::OP_SEL_1; - } else if (HiStat == IS_LOWER_HALF_NEG) { + } else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG) { Mods ^= SISrcMods::NEG_HI; } - if (LoStat == IS_UPPER_HALF_NEG) { + if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) { Mods ^= SISrcMods::NEG; Mods |= SISrcMods::OP_SEL_0; - } else if (LoStat == IS_UPPER_HALF) { + } else if (LoStat == SrcStatus::IS_UPPER_HALF) { Mods |= SISrcMods::OP_SEL_0; - } else if (LoStat == IS_UPPER_HALF_NEG) { + } else if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) { Mods |= SISrcMods::NEG; } return true; } } else { - if ((HiStat == IS_SAME || HiStat == IS_NEG) && - (LoStat == IS_SAME || LoStat == IS_NEG) && + if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_NEG) && + (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_NEG) && isInlinableConstant(*NewOp, TII)) { - if (HiStat == IS_NEG) + if (HiStat == SrcStatus::IS_NEG) Mods ^= SISrcMods::NEG_HI; - if (LoStat == IS_NEG) + if (LoStat == SrcStatus::IS_NEG) Mods ^= SISrcMods::NEG; // opsel = opsel_hi = 0, since the upper half and lower half both - // the same as the target inlinable constant + // the same as the target inlinable constant. return true; } } @@ -4543,7 +4546,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; } - if (Stat.second == IS_NEG) + if (Stat.second == SrcStatus::IS_NEG) Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); Op = Stat.first; @@ -4611,7 +4614,7 @@ getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII) { // RootOp can only be VGPR or SGPR (some hand written cases such as - // inst-select-ashr.v2s16.mir::ashr_v2s16_vs) + // inst-select-ashr.v2s16.mir::ashr_v2s16_vs). if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) || checkRB(NewOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) return NewOp; @@ -4619,7 +4622,7 @@ getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp, MachineInstr *MI = MRI.getVRegDef(RootOp->getReg()); if (MI->getOpcode() == AMDGPU::COPY && isSameOperand(NewOp, &MI->getOperand(1))) { - // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp + // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp. return RootOp; } From fc7c927a4b56afb1cc3a1431ba87cbd185242aee Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 24 Mar 2025 10:57:20 +0800 Subject: [PATCH 12/20] Block for root type other than 2 x Type --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 206 ++++++++++++++++-- .../AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll | 3 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll | 12 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll | 6 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll | 12 +- .../AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll | 18 +- .../AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll | 12 +- .../AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll | 12 +- 8 files changed, 237 insertions(+), 44 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 0a5d0dede02e0..cb34986d8c77e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4318,9 +4318,12 @@ enum class SrcStatus { IS_SAME, IS_UPPER_HALF, IS_LOWER_HALF, - IS_NEG, + IS_HI_NEG, + IS_LO_NEG, + IS_BOTH_NEG, IS_UPPER_HALF_NEG, - IS_LOWER_HALF_NEG + IS_LOWER_HALF_NEG, + INVALID }; static bool isTruncHalf(const MachineInstr *MI, @@ -4365,8 +4368,9 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { static bool retOpStat(const MachineOperand *Op, SrcStatus Stat, std::pair &Curr) { - if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() || - Op->isCImm() || Op->isFPImm()) { + if (Stat != SrcStatus::INVALID && + ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() || + Op->isCImm() || Op->isFPImm())) { Curr = {Op, Stat}; return true; } @@ -4374,20 +4378,164 @@ static bool retOpStat(const MachineOperand *Op, SrcStatus Stat, return false; } -SrcStatus getNegStatus(SrcStatus S) { +// 0 = Vector of 2, +// 1 = Scalar +// -1 = non of them +static int isVectorOfTwoOrScalar(const MachineOperand *Op, + const MachineRegisterInfo &MRI) { + if (!Op->isReg() || Op->getReg().isPhysical()) + return -1; + LLT OpTy = MRI.getType(Op->getReg()); + if (OpTy.isScalar()) + return 1; + if (OpTy.isVector() && OpTy.getNumElements() == 2) + return 0; + return -1; +} + +SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, + const MachineRegisterInfo &MRI) { + int NegType = isVectorOfTwoOrScalar(Op, MRI); + if (NegType != 0 && NegType != 1) + return SrcStatus::INVALID; + switch (S) { case SrcStatus::IS_SAME: - return SrcStatus::IS_NEG; + if (NegType == 0) { + // Vector of 2: + // [SrcHi, SrcLo] = [CurrHi, CurrLo] + // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type) + // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) + // [SrcHi, SrcLo] = [-OpHi, -OpLo] + return SrcStatus::IS_BOTH_NEG; + } else if (NegType == 1) { + // Scalar: + // [SrcHi, SrcLo] = [CurrHi, CurrLo] + // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type) + // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) + // [SrcHi, SrcLo] = [-OpHi, OpLo] + return SrcStatus::IS_HI_NEG; + } + break; + case SrcStatus::IS_HI_NEG: + if (NegType == 0) { + // Vector of 2: + // [SrcHi, SrcLo] = [-CurrHi, CurrLo] + // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type) + // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) + // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo] + return SrcStatus::IS_LO_NEG; + } else if (NegType == 1) { + // Scalar: + // [SrcHi, SrcLo] = [-CurrHi, CurrLo] + // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type) + // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) + // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo] + return SrcStatus::IS_SAME; + } + break; + case SrcStatus::IS_LO_NEG: + if (NegType == 0) { + // Vector of 2: + // [SrcHi, SrcLo] = [CurrHi, -CurrLo] + // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type) + // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) + // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo] + return SrcStatus::IS_HI_NEG; + } else if (NegType == 1) { + // Scalar: + // [SrcHi, SrcLo] = [CurrHi, -CurrLo] + // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type) + // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) + // [SrcHi, SrcLo] = [-OpHi, -OpLo] + return SrcStatus::IS_BOTH_NEG; + } + break; + case SrcStatus::IS_BOTH_NEG: + if (NegType == 0) { + // Vector of 2: + // [SrcHi, SrcLo] = [-CurrHi, -CurrLo] + // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type) + // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) + // [SrcHi, SrcLo] = [OpHi, OpLo] + return SrcStatus::IS_SAME; + } else if (NegType == 1) { + // Scalar: + // [SrcHi, SrcLo] = [-CurrHi, -CurrLo] + // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type) + // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) + // [SrcHi, SrcLo] = [OpHi, -OpLo] + return SrcStatus::IS_LO_NEG; + } + break; case SrcStatus::IS_UPPER_HALF: + // Vector of 2: + // Src = CurrUpper + // Curr = [CurrUpper, CurrLower] + // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) + // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) + // Src = -OpUpper + // + // Scalar: + // Src = CurrUpper + // Curr = [CurrUpper, CurrLower] + // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) + // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) + // Src = -OpUpper return SrcStatus::IS_UPPER_HALF_NEG; case SrcStatus::IS_LOWER_HALF: - return SrcStatus::IS_LOWER_HALF_NEG; - case SrcStatus::IS_NEG: - return SrcStatus::IS_SAME; + if (NegType == 0) { + // Vector of 2: + // Src = CurrLower + // Curr = [CurrUpper, CurrLower] + // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) + // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) + // Src = -OpLower + return SrcStatus::IS_LOWER_HALF_NEG; + } else if (NegType == 1) { + // Scalar: + // Src = CurrLower + // Curr = [CurrUpper, CurrLower] + // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) + // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) + // Src = OpLower + return SrcStatus::IS_LOWER_HALF; + } + break; case SrcStatus::IS_UPPER_HALF_NEG: + // Vector of 2: + // Src = -CurrUpper + // Curr = [CurrUpper, CurrLower] + // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) + // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) + // Src = -(-OpUpper) = OpUpper + // + // Scalar: + // Src = -CurrUpper + // Curr = [CurrUpper, CurrLower] + // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) + // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) + // Src = -(-OpUpper) = OpUpper return SrcStatus::IS_UPPER_HALF; case SrcStatus::IS_LOWER_HALF_NEG: - return SrcStatus::IS_LOWER_HALF; + if (NegType == 0) { + // Vector of 2: + // Src = -CurrLower + // Curr = [CurrUpper, CurrLower] + // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) + // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) + // Src = -(-OpLower) = OpLower + return SrcStatus::IS_LOWER_HALF_NEG; + } else if (NegType == 1) { + // Scalar: + // Src = -CurrLower + // Curr = [CurrUpper, CurrLower] + // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) + // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) + // Src = -OpLower + return SrcStatus::IS_LOWER_HALF; + } + break; } llvm_unreachable("unexpected SrcStatus"); } @@ -4417,7 +4565,8 @@ static bool calcNextStatus(std::pair &Curr, case AMDGPU::COPY: return retOpStat(&MI->getOperand(1), Curr.second, Curr); case AMDGPU::G_FNEG: - return retOpStat(&MI->getOperand(1), getNegStatus(Curr.second), Curr); + return retOpStat(&MI->getOperand(1), + getNegStatus(Curr.first, Curr.second, MRI), Curr); default: break; } @@ -4428,9 +4577,16 @@ static bool calcNextStatus(std::pair &Curr, if (isTruncHalf(MI, MRI)) return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr); break; - case SrcStatus::IS_NEG: - if (isTruncHalf(MI, MRI)) + case SrcStatus::IS_HI_NEG: + if (isTruncHalf(MI, MRI)) { + // [SrcHi, SrcLo] = [-CurrHi, CurrLo] + // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower + // = [OpLowerHi, OpLowerLo] + // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo] + // = [-OpLowerHi, OpLowerLo] + // = -OpLower return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr); + } break; case SrcStatus::IS_UPPER_HALF: if (isShlHalf(MI, MRI)) @@ -4464,7 +4620,9 @@ getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) { Depth++; if ((OnlyLastSameOrNeg && (Curr.second != SrcStatus::IS_SAME && - Curr.second != SrcStatus::IS_NEG))) + Curr.second != SrcStatus::IS_HI_NEG && + Curr.second != SrcStatus::IS_LO_NEG && + Curr.second != SrcStatus::IS_BOTH_NEG))) break; if (!OnlyLastSameOrNeg) @@ -4523,12 +4681,12 @@ static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, return true; } } else { - if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_NEG) && - (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_NEG) && + if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) && + (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) && isInlinableConstant(*NewOp, TII)) { - if (HiStat == SrcStatus::IS_NEG) + if (HiStat == SrcStatus::IS_HI_NEG) Mods ^= SISrcMods::NEG_HI; - if (LoStat == SrcStatus::IS_NEG) + if (LoStat == SrcStatus::IS_HI_NEG) Mods ^= SISrcMods::NEG; // opsel = opsel_hi = 0, since the upper half and lower half both // the same as the target inlinable constant. @@ -4543,6 +4701,12 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, const MachineRegisterInfo &MRI, bool IsDOT) const { unsigned Mods = 0; + // No modification if Root type is not form of <2 x Type> + if (isVectorOfTwoOrScalar(Op, MRI) != 0) { + Mods |= SISrcMods::OP_SEL_1; + return {Op, Mods}; + } + const MachineOperand *RootOp = Op; std::pair Stat = getSrcStats(Op, MRI, true)[0]; @@ -4550,8 +4714,12 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; } - if (Stat.second == SrcStatus::IS_NEG) + if (Stat.second == SrcStatus::IS_BOTH_NEG) Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + else if (Stat.second == SrcStatus::IS_HI_NEG) + Mods ^= SISrcMods::NEG_HI; + else if (Stat.second == SrcStatus::IS_LO_NEG) + Mods ^= SISrcMods::NEG; Op = Stat.first; MachineInstr *MI = MRI.getVRegDef(Op->getReg()); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll index 2243c57cf37ac..1d9514c58ab9c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll @@ -68,7 +68,8 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_neg_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll index edb09e0ad646b..8f0ae8c47098a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -248,7 +248,8 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) { ; GFX906-LABEL: v_sdot2_fnegf32_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_fnegf32_c: @@ -262,7 +263,8 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) { ; GFX10-LABEL: v_sdot2_fnegf32_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg float %c %cast.neg.c = bitcast float %neg.c to i32 @@ -274,7 +276,8 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) { ; GFX906-LABEL: v_sdot2_fnegv2f16_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_fnegv2f16_c: @@ -288,7 +291,8 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) { ; GFX10-LABEL: v_sdot2_fnegv2f16_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg <2 x half> %c %cast.neg.c = bitcast <2 x half> %neg.c to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll index d6ef48e25cafb..06560afee3c9a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -91,7 +91,8 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_sdot4_fnegf32_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot4_fnegf32_a: @@ -111,7 +112,8 @@ define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_sdot4_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot4_fnegv2f16_a: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll index d2aa47df81cbe..0d729351f65a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll @@ -47,13 +47,15 @@ define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_sdot8_fnegf32_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot8_fnegf32_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg float %a %cast.neg.a = bitcast float %neg.a to i32 @@ -65,13 +67,15 @@ define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_sdot8_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot8_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll index b7cf49bbfab20..287a009ca1405 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -235,19 +235,22 @@ define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) { ; GFX906-LABEL: v_udot2_fnegf32_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_fnegf32_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_fnegf32_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg float %c %cast.neg.c = bitcast float %neg.c to i32 @@ -259,19 +262,22 @@ define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) { ; GFX906-LABEL: v_udot2_fnegv2f16_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_fnegv2f16_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_fnegv2f16_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg <2 x half> %c %cast.neg.c = bitcast <2 x half> %neg.c to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll index 7ad0404942feb..b14af9e043e09 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -112,13 +112,15 @@ define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_udot4_fnegf32_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_udot4_fnegf32_a: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg float %a %cast.neg.a = bitcast float %neg.a to i32 @@ -130,13 +132,15 @@ define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_udot4_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_udot4_fnegv2f16_a: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll index 52763bbc24e40..a664c8aa508ef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll @@ -48,13 +48,15 @@ define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_udot8_fnegf32_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_udot8_fnegf32_a: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg float %a %cast.neg.a = bitcast float %neg.a to i32 @@ -66,13 +68,15 @@ define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) { ; GFX906-LABEL: v_udot8_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_udot8_fnegv2f16_a: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 From bc51bf48144423540958effc4a7487d735787add Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Mon, 24 Mar 2025 18:03:43 +0800 Subject: [PATCH 13/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 85 +++++++-- .../CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll | 163 ++++++++++++++++++ 2 files changed, 229 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index cb34986d8c77e..f43203f437484 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4318,12 +4318,16 @@ enum class SrcStatus { IS_SAME, IS_UPPER_HALF, IS_LOWER_HALF, + IS_UPPER_HALF_NEG, + IS_LOWER_HALF_NEG, IS_HI_NEG, IS_LO_NEG, IS_BOTH_NEG, - IS_UPPER_HALF_NEG, - IS_LOWER_HALF_NEG, - INVALID + INVALID, + NEG_START = IS_UPPER_HALF_NEG, + NEG_END = IS_BOTH_NEG, + HALF_START = IS_UPPER_HALF, + HALF_END = IS_LOWER_HALF_NEG }; static bool isTruncHalf(const MachineInstr *MI, @@ -4525,7 +4529,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) // Src = -(-OpLower) = OpLower - return SrcStatus::IS_LOWER_HALF_NEG; + return SrcStatus::IS_LOWER_HALF; } else if (NegType == 1) { // Scalar: // Src = -CurrLower @@ -4533,7 +4537,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) // Src = -OpLower - return SrcStatus::IS_LOWER_HALF; + return SrcStatus::IS_LOWER_HALF_NEG; } break; } @@ -4610,26 +4614,69 @@ static bool calcNextStatus(std::pair &Curr, return false; } +struct { + unsigned int HasNeg : 1; + unsigned int HasOpsel : 1; +} StatOptions; + +static bool checkOptions(SrcStatus Stat) { + if (!StatOptions.HasNeg && + (Stat >= SrcStatus::NEG_START || Stat <= SrcStatus::NEG_END)) { + return false; + } + if (!StatOptions.HasOpsel && + (Stat >= SrcStatus::HALF_START || Stat >= SrcStatus::HALF_END)) { + return false; + } + return true; +} + +void setUpOptions(const MachineOperand *RootOp, + const MachineRegisterInfo &MRI) { + const MachineInstr *MI = RootOp->getParent(); + unsigned Opc = MI->getOpcode(); + + if (Opc < TargetOpcode::GENERIC_OP_END) { + // Keep same for gerneric op + StatOptions.HasNeg = 1; + } else if (Opc == TargetOpcode::G_INTRINSIC) { + Intrinsic::ID IntrinsicID = cast(*MI).getIntrinsicID(); + // Only float point intrinsic has neg & neg_hi bits + if (IntrinsicID == Intrinsic::amdgcn_fdot2) + StatOptions.HasNeg = 1; + else + StatOptions.HasNeg = 0; + } else + StatOptions.HasNeg = 0; + + // Assume all complex pattern of VOP3P has opsel + StatOptions.HasOpsel = 1; +} + SmallVector> getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, bool OnlyLastSameOrNeg = false, int MaxDepth = 6) { int Depth = 0; std::pair Curr = {Op, SrcStatus::IS_SAME}; - SmallVector> Statlist; + SmallVector, 4> Statlist; + + if (OnlyLastSameOrNeg) + Statlist.push_back(Curr); while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) { Depth++; - if ((OnlyLastSameOrNeg && (Curr.second != SrcStatus::IS_SAME && - Curr.second != SrcStatus::IS_HI_NEG && - Curr.second != SrcStatus::IS_LO_NEG && - Curr.second != SrcStatus::IS_BOTH_NEG))) - break; - - if (!OnlyLastSameOrNeg) - Statlist.push_back(Curr); + if (checkOptions(Curr.second)) { + if (OnlyLastSameOrNeg && (Curr.second == SrcStatus::IS_SAME || + Curr.second == SrcStatus::IS_HI_NEG || + Curr.second == SrcStatus::IS_LO_NEG || + Curr.second == SrcStatus::IS_BOTH_NEG)) + Statlist[0] = Curr; + + if (!OnlyLastSameOrNeg) + Statlist.push_back(Curr); + } } - if (OnlyLastSameOrNeg) - Statlist.push_back(Curr); + return Statlist; } @@ -4648,9 +4695,7 @@ static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2, static bool isSameOperand(const MachineOperand *Op1, const MachineOperand *Op2) { if (Op1->isReg()) { - if (Op2->isReg()) - return Op1->getReg() == Op2->getReg(); - return false; + return Op2->isReg() && Op1->getReg() == Op2->getReg(); } return Op1->isIdenticalTo(*Op2); } @@ -4707,6 +4752,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, return {Op, Mods}; } + setUpOptions(Op, MRI); + const MachineOperand *RootOp = Op; std::pair Stat = getSrcStats(Op, MRI, true)[0]; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll index 543f8e413abd8..9f0641b715d36 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -106,6 +106,169 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) ret <2 x half> %mul } +define <2 x half> @v_fmul_v2f16_partial_neg(<2 x half> %a, <2 x half> %b) { +; GFX9-LABEL: v_fmul_v2f16_partial_neg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0] +; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v2f16_partial_neg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v1 +; GFX8-NEXT: v_mul_f16_e32 v3, v1, v0 +; GFX8-NEXT: v_mul_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX8-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v2f16_partial_neg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0] +; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] + %b1 = bitcast <2 x half> %b to float + %b2 = fneg float %b1 + %b3 = bitcast float %b2 to <2 x half> + %b4 = fneg <2 x half> %b3 + %mul1 = fmul <2 x half> %b3, %a + %mul2 = fmul <2 x half> %b4, %mul1 + ret <2 x half> %mul2 +} + +define <2 x half> @fmul_v2_half_neg_hi(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: fmul_v2_half_neg_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: fmul_v2_half_neg_hi: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: fmul_v2_half_neg_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] + %b1 = bitcast <2 x half> %b to float + %b2 = fneg float %b1 + %b3 = bitcast float %b2 to <2 x half> + %b4 = extractelement <2 x half> %b3, i64 1 + %tmp = insertelement <2 x half> poison, half %b4, i64 0 + %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> + %mul = fmul <2 x half> %a, %k + ret <2 x half> %mul +} + +define <2 x half> @fmul_v2_half_neg_hi1(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: fmul_v2_half_neg_hi1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: fmul_v2_half_neg_hi1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: fmul_v2_half_neg_hi1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %b1 = bitcast <2 x half> %b to float + %b2 = fneg float %b1 + %b3 = bitcast float %b2 to <2 x half> + %b4 = fneg <2 x half> %b3 + %b5 = extractelement <2 x half> %b4, i64 1 + %tmp = insertelement <2 x half> poison, half %b5, i64 0 + %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> + %mul = fmul <2 x half> %a, %k + ret <2 x half> %mul +} + +define <2 x half> @fmul_v2_half_neg_lo(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: fmul_v2_half_neg_lo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] neg_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: fmul_v2_half_neg_lo: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: fmul_v2_half_neg_lo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] neg_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] + %b1 = bitcast <2 x half> %b to float + %b2 = fneg float %b1 + %b3 = bitcast float %b2 to <2 x half> + %b4 = fneg <2 x half> %b3 + %b5 = extractelement <2 x half> %b4, i64 0 + %tmp = insertelement <2 x half> poison, half %b5, i64 0 + %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> + %mul = fmul <2 x half> %a, %k + ret <2 x half> %mul +} + +define <2 x half> @fmul_v2_half_neg_lo1(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: fmul_v2_half_neg_lo1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: fmul_v2_half_neg_lo1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: fmul_v2_half_neg_lo1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] + %b1 = bitcast <2 x half> %b to float + %b2 = fneg float %b1 + %b3 = bitcast float %b2 to <2 x half> + %b4 = extractelement <2 x half> %b3, i64 0 + %tmp = insertelement <2 x half> poison, half %b4, i64 0 + %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> + %mul = fmul <2 x half> %a, %k + ret <2 x half> %mul +} + define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) { ; GFX9-LABEL: v_fmul_v3f16: ; GFX9: ; %bb.0: From 47840d7832636f2d393d9d848ff86aaafd3a8dc7 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 26 Mar 2025 10:37:45 +0800 Subject: [PATCH 14/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 194 ++++++++++-------- 1 file changed, 106 insertions(+), 88 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index f43203f437484..c6bd40dc51fcb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4382,37 +4382,36 @@ static bool retOpStat(const MachineOperand *Op, SrcStatus Stat, return false; } -// 0 = Vector of 2, -// 1 = Scalar -// -1 = non of them -static int isVectorOfTwoOrScalar(const MachineOperand *Op, - const MachineRegisterInfo &MRI) { +enum class TypeClass { VECTOR_OF_TWO, SCALAR, NON_OF_LISTED }; + +static TypeClass isVectorOfTwoOrScalar(const MachineOperand *Op, + const MachineRegisterInfo &MRI) { if (!Op->isReg() || Op->getReg().isPhysical()) - return -1; + return TypeClass::NON_OF_LISTED; LLT OpTy = MRI.getType(Op->getReg()); if (OpTy.isScalar()) - return 1; + return TypeClass::SCALAR; if (OpTy.isVector() && OpTy.getNumElements() == 2) - return 0; - return -1; + return TypeClass::VECTOR_OF_TWO; + return TypeClass::NON_OF_LISTED; } SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, const MachineRegisterInfo &MRI) { - int NegType = isVectorOfTwoOrScalar(Op, MRI); - if (NegType != 0 && NegType != 1) + TypeClass NegType = isVectorOfTwoOrScalar(Op, MRI); + if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR) return SrcStatus::INVALID; switch (S) { case SrcStatus::IS_SAME: - if (NegType == 0) { + if (NegType == TypeClass::VECTOR_OF_TWO) { // Vector of 2: // [SrcHi, SrcLo] = [CurrHi, CurrLo] // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type) // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) // [SrcHi, SrcLo] = [-OpHi, -OpLo] return SrcStatus::IS_BOTH_NEG; - } else if (NegType == 1) { + } else if (NegType == TypeClass::SCALAR) { // Scalar: // [SrcHi, SrcLo] = [CurrHi, CurrLo] // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type) @@ -4422,14 +4421,14 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, } break; case SrcStatus::IS_HI_NEG: - if (NegType == 0) { + if (NegType == TypeClass::VECTOR_OF_TWO) { // Vector of 2: // [SrcHi, SrcLo] = [-CurrHi, CurrLo] // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type) // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo] return SrcStatus::IS_LO_NEG; - } else if (NegType == 1) { + } else if (NegType == TypeClass::SCALAR) { // Scalar: // [SrcHi, SrcLo] = [-CurrHi, CurrLo] // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type) @@ -4439,14 +4438,14 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, } break; case SrcStatus::IS_LO_NEG: - if (NegType == 0) { + if (NegType == TypeClass::VECTOR_OF_TWO) { // Vector of 2: // [SrcHi, SrcLo] = [CurrHi, -CurrLo] // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type) // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo] return SrcStatus::IS_HI_NEG; - } else if (NegType == 1) { + } else if (NegType == TypeClass::SCALAR) { // Scalar: // [SrcHi, SrcLo] = [CurrHi, -CurrLo] // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type) @@ -4456,14 +4455,14 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, } break; case SrcStatus::IS_BOTH_NEG: - if (NegType == 0) { + if (NegType == TypeClass::VECTOR_OF_TWO) { // Vector of 2: // [SrcHi, SrcLo] = [-CurrHi, -CurrLo] // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type) // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) // [SrcHi, SrcLo] = [OpHi, OpLo] return SrcStatus::IS_SAME; - } else if (NegType == 1) { + } else if (NegType == TypeClass::SCALAR) { // Scalar: // [SrcHi, SrcLo] = [-CurrHi, -CurrLo] // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type) @@ -4488,7 +4487,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, // Src = -OpUpper return SrcStatus::IS_UPPER_HALF_NEG; case SrcStatus::IS_LOWER_HALF: - if (NegType == 0) { + if (NegType == TypeClass::VECTOR_OF_TWO) { // Vector of 2: // Src = CurrLower // Curr = [CurrUpper, CurrLower] @@ -4496,7 +4495,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) // Src = -OpLower return SrcStatus::IS_LOWER_HALF_NEG; - } else if (NegType == 1) { + } else if (NegType == TypeClass::SCALAR) { // Scalar: // Src = CurrLower // Curr = [CurrUpper, CurrLower] @@ -4522,7 +4521,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, // Src = -(-OpUpper) = OpUpper return SrcStatus::IS_UPPER_HALF; case SrcStatus::IS_LOWER_HALF_NEG: - if (NegType == 0) { + if (NegType == TypeClass::VECTOR_OF_TWO) { // Vector of 2: // Src = -CurrLower // Curr = [CurrUpper, CurrLower] @@ -4530,7 +4529,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) // Src = -(-OpLower) = OpLower return SrcStatus::IS_LOWER_HALF; - } else if (NegType == 1) { + } else if (NegType == TypeClass::SCALAR) { // Scalar: // Src = -CurrLower // Curr = [CurrUpper, CurrLower] @@ -4615,8 +4614,8 @@ static bool calcNextStatus(std::pair &Curr, } struct { - unsigned int HasNeg : 1; - unsigned int HasOpsel : 1; + bool HasNeg; + bool HasOpsel; } StatOptions; static bool checkOptions(SrcStatus Stat) { @@ -4653,33 +4652,44 @@ void setUpOptions(const MachineOperand *RootOp, StatOptions.HasOpsel = 1; } -SmallVector> +static SmallVector> getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, - bool OnlyLastSameOrNeg = false, int MaxDepth = 6) { + int MaxDepth = 6) { int Depth = 0; std::pair Curr = {Op, SrcStatus::IS_SAME}; SmallVector, 4> Statlist; - if (OnlyLastSameOrNeg) - Statlist.push_back(Curr); - while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) { Depth++; if (checkOptions(Curr.second)) { - if (OnlyLastSameOrNeg && (Curr.second == SrcStatus::IS_SAME || - Curr.second == SrcStatus::IS_HI_NEG || - Curr.second == SrcStatus::IS_LO_NEG || - Curr.second == SrcStatus::IS_BOTH_NEG)) - Statlist[0] = Curr; - - if (!OnlyLastSameOrNeg) - Statlist.push_back(Curr); + Statlist.push_back(Curr); } } return Statlist; } +static std::pair +getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI, + int MaxDepth = 6) { + int Depth = 0; + std::pair Curr = {Op, SrcStatus::IS_SAME}; + std::pair LastSameOrNeg = Curr; + + while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) { + Depth++; + if (checkOptions(Curr.second)) { + if (Curr.second == SrcStatus::IS_SAME || + Curr.second == SrcStatus::IS_HI_NEG || + Curr.second == SrcStatus::IS_LO_NEG || + Curr.second == SrcStatus::IS_BOTH_NEG) + LastSameOrNeg = Curr; + } + } + + return LastSameOrNeg; +} + static bool isInlinableConstant(const MachineOperand &Op, const SIInstrInfo &TII) { return Op.isFPImm() && TII.isInlineConstant(Op.getFPImm()->getValueAPF()); @@ -4700,41 +4710,51 @@ static bool isSameOperand(const MachineOperand *Op1, return Op1->isIdenticalTo(*Op2); } +unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) { + // SrcStatus::IS_LOWER_HALF remain 0. + if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) { + Mods ^= SISrcMods::NEG_HI; + Mods |= SISrcMods::OP_SEL_1; + } else if (HiStat == SrcStatus::IS_UPPER_HALF) + Mods |= SISrcMods::OP_SEL_1; + else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG) + Mods ^= SISrcMods::NEG_HI; + else if (HiStat == SrcStatus::IS_HI_NEG) + Mods ^= SISrcMods::NEG_HI; + + if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) { + Mods ^= SISrcMods::NEG; + Mods |= SISrcMods::OP_SEL_0; + } else if (LoStat == SrcStatus::IS_UPPER_HALF) + Mods |= SISrcMods::OP_SEL_0; + else if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) + Mods |= SISrcMods::NEG; + else if (LoStat == SrcStatus::IS_HI_NEG) + Mods ^= SISrcMods::NEG; + + return Mods; +} + static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, unsigned int &Mods, const MachineOperand *NewOp, const MachineOperand *RootOp, const SIInstrInfo &TII, const MachineRegisterInfo &MRI) { if (NewOp->isReg()) { - if (isSameBitWidth(NewOp, RootOp, MRI)) { - // SrcStatus::IS_LOWER_HALF remain 0. - if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) { - Mods ^= SISrcMods::NEG_HI; - Mods |= SISrcMods::OP_SEL_1; - } else if (HiStat == SrcStatus::IS_UPPER_HALF) { - Mods |= SISrcMods::OP_SEL_1; - } else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG) { - Mods ^= SISrcMods::NEG_HI; - } - if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) { - Mods ^= SISrcMods::NEG; - Mods |= SISrcMods::OP_SEL_0; - } else if (LoStat == SrcStatus::IS_UPPER_HALF) { - Mods |= SISrcMods::OP_SEL_0; - } else if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) { - Mods |= SISrcMods::NEG; - } + if (isSameBitWidth(NewOp, RootOp, MRI) && + (HiStat == SrcStatus::IS_UPPER_HALF || + HiStat == SrcStatus::IS_UPPER_HALF_NEG || + HiStat == SrcStatus::IS_LOWER_HALF || + HiStat == SrcStatus::IS_LOWER_HALF_NEG) && + (LoStat == SrcStatus::IS_UPPER_HALF || + LoStat == SrcStatus::IS_UPPER_HALF_NEG || + LoStat == SrcStatus::IS_LOWER_HALF || + LoStat == SrcStatus::IS_LOWER_HALF_NEG)) { return true; } } else { if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) && (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) && isInlinableConstant(*NewOp, TII)) { - if (HiStat == SrcStatus::IS_HI_NEG) - Mods ^= SISrcMods::NEG_HI; - if (LoStat == SrcStatus::IS_HI_NEG) - Mods ^= SISrcMods::NEG; - // opsel = opsel_hi = 0, since the upper half and lower half both - // the same as the target inlinable constant. return true; } } @@ -4742,21 +4762,20 @@ static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, } std::pair -AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, +AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, const MachineRegisterInfo &MRI, bool IsDOT) const { unsigned Mods = 0; + const MachineOperand *Op = RootOp; // No modification if Root type is not form of <2 x Type> - if (isVectorOfTwoOrScalar(Op, MRI) != 0) { + if (isVectorOfTwoOrScalar(Op, MRI) != TypeClass::VECTOR_OF_TWO) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; } setUpOptions(Op, MRI); - const MachineOperand *RootOp = Op; - std::pair Stat = - getSrcStats(Op, MRI, true)[0]; + std::pair Stat = getLastSameOrNeg(Op, MRI); if (!Stat.first->isReg()) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; @@ -4777,16 +4796,16 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, return {Op, Mods}; } - SmallVector> StatlistHi; - StatlistHi = getSrcStats(&MI->getOperand(2), MRI); + SmallVector> StatlistHi = + getSrcStats(&MI->getOperand(2), MRI); if (StatlistHi.size() == 0) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; } - SmallVector> StatlistLo; - StatlistLo = getSrcStats(&MI->getOperand(1), MRI); + SmallVector> StatlistLo = + getSrcStats(&MI->getOperand(1), MRI); if (StatlistLo.size() == 0) { Mods |= SISrcMods::OP_SEL_1; @@ -4798,7 +4817,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op, if (isSameOperand(StatlistHi[i].first, StatlistLo[j].first) && isValidToPack(StatlistHi[i].second, StatlistLo[j].second, Mods, StatlistHi[i].first, RootOp, TII, MRI)) - return {StatlistHi[i].first, Mods}; + return {StatlistHi[i].first, + updateMods(StatlistHi[i].second, StatlistLo[j].second, Mods)}; } } // Packed instructions do not have abs modifiers. @@ -4863,18 +4883,17 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); - std::pair Res = - selectVOP3PModsImpl(&Root, MRI); - if (!(Res.first->isReg())) + auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI); + if (!(Op->isReg())) return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Res.first)); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; - Res.first = getVReg(Res.first, &Root, RBI, MRI, TRI, TII); + Op = getVReg(Op, &Root, RBI, MRI, TRI, TII); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Res.first->getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; } @@ -4883,18 +4902,17 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); - std::pair Res = - selectVOP3PModsImpl(&Root, MRI, true); - if (!(Res.first->isReg())) + auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI, true); + if (!(Op->isReg())) return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Res.first)); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; - Res.first = getVReg(Res.first, &Root, RBI, MRI, TRI, TII); + Op = getVReg(Op, &Root, RBI, MRI, TRI, TII); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Res.first->getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; } From 6fe41472c046a58e7a55cd84a49ad64475ec9987 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 26 Mar 2025 11:15:54 +0800 Subject: [PATCH 15/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 57 ++++++++++--------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index c6bd40dc51fcb..18b8bdd54c612 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4370,16 +4370,17 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { return false; } -static bool retOpStat(const MachineOperand *Op, SrcStatus Stat, - std::pair &Curr) { +std::optional> +retOpStat(const MachineOperand *Op, SrcStatus Stat, + std::pair &Curr) { if (Stat != SrcStatus::INVALID && ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() || Op->isCImm() || Op->isFPImm())) { - Curr = {Op, Stat}; - return true; + return std::optional>( + {Op, Stat}); } - return false; + return std::nullopt; } enum class TypeClass { VECTOR_OF_TWO, SCALAR, NON_OF_LISTED }; @@ -4543,10 +4544,11 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, llvm_unreachable("unexpected SrcStatus"); } -static bool calcNextStatus(std::pair &Curr, - const MachineRegisterInfo &MRI) { +std::optional> +calcNextStatus(std::pair Curr, + const MachineRegisterInfo &MRI) { if (!Curr.first->isReg()) - return false; + return std::nullopt; const MachineInstr *MI = nullptr; @@ -4556,7 +4558,7 @@ static bool calcNextStatus(std::pair &Curr, MI = Curr.first->getParent(); if (!MI) - return false; + return std::nullopt; unsigned Opc = MI->getOpcode(); @@ -4610,7 +4612,7 @@ static bool calcNextStatus(std::pair &Curr, default: break; } - return false; + return std::nullopt; } struct { @@ -4656,14 +4658,15 @@ static SmallVector> getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, int MaxDepth = 6) { int Depth = 0; - std::pair Curr = {Op, SrcStatus::IS_SAME}; + auto Curr = calcNextStatus({Op, SrcStatus::IS_SAME}, MRI); SmallVector, 4> Statlist; - while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) { + while (Depth <= MaxDepth && Curr.has_value()) { Depth++; - if (checkOptions(Curr.second)) { - Statlist.push_back(Curr); + if (checkOptions(Curr.value().second)) { + Statlist.push_back(Curr.value()); } + Curr = calcNextStatus(Curr.value(), MRI); } return Statlist; @@ -4673,25 +4676,27 @@ static std::pair getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI, int MaxDepth = 6) { int Depth = 0; - std::pair Curr = {Op, SrcStatus::IS_SAME}; - std::pair LastSameOrNeg = Curr; + std::pair LastSameOrNeg = { + Op, SrcStatus::IS_SAME}; + auto Curr = calcNextStatus(LastSameOrNeg, MRI); - while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) { + while (Depth <= MaxDepth && Curr.has_value()) { Depth++; - if (checkOptions(Curr.second)) { - if (Curr.second == SrcStatus::IS_SAME || - Curr.second == SrcStatus::IS_HI_NEG || - Curr.second == SrcStatus::IS_LO_NEG || - Curr.second == SrcStatus::IS_BOTH_NEG) - LastSameOrNeg = Curr; + if (checkOptions(Curr.value().second)) { + if (Curr.value().second == SrcStatus::IS_SAME || + Curr.value().second == SrcStatus::IS_HI_NEG || + Curr.value().second == SrcStatus::IS_LO_NEG || + Curr.value().second == SrcStatus::IS_BOTH_NEG) + LastSameOrNeg = Curr.value(); } + Curr = calcNextStatus(Curr.value(), MRI); } return LastSameOrNeg; } -static bool isInlinableConstant(const MachineOperand &Op, - const SIInstrInfo &TII) { +static bool isInlinableFPConstant(const MachineOperand &Op, + const SIInstrInfo &TII) { return Op.isFPImm() && TII.isInlineConstant(Op.getFPImm()->getValueAPF()); } @@ -4754,7 +4759,7 @@ static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, } else { if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) && (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) && - isInlinableConstant(*NewOp, TII)) { + isInlinableFPConstant(*NewOp, TII)) { return true; } } From 3b7f377d17525647ab0583cfec56eb95514ca14e Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 26 Mar 2025 20:24:56 +0800 Subject: [PATCH 16/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 60 +++++++++---------- llvm/test/lit.cfg.py | 2 +- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 18b8bdd54c612..ee99276be907e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4370,7 +4370,7 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { return false; } -std::optional> +static std::optional> retOpStat(const MachineOperand *Op, SrcStatus Stat, std::pair &Curr) { if (Stat != SrcStatus::INVALID && @@ -4397,8 +4397,8 @@ static TypeClass isVectorOfTwoOrScalar(const MachineOperand *Op, return TypeClass::NON_OF_LISTED; } -SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, - const MachineRegisterInfo &MRI) { +static SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, + const MachineRegisterInfo &MRI) { TypeClass NegType = isVectorOfTwoOrScalar(Op, MRI); if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR) return SrcStatus::INVALID; @@ -4544,7 +4544,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, llvm_unreachable("unexpected SrcStatus"); } -std::optional> +static std::optional> calcNextStatus(std::pair Curr, const MachineRegisterInfo &MRI) { if (!Curr.first->isReg()) @@ -4632,26 +4632,26 @@ static bool checkOptions(SrcStatus Stat) { return true; } -void setUpOptions(const MachineOperand *RootOp, - const MachineRegisterInfo &MRI) { +static void setUpOptions(const MachineOperand *RootOp, + const MachineRegisterInfo &MRI) { const MachineInstr *MI = RootOp->getParent(); unsigned Opc = MI->getOpcode(); if (Opc < TargetOpcode::GENERIC_OP_END) { // Keep same for gerneric op - StatOptions.HasNeg = 1; + StatOptions.HasNeg = true; } else if (Opc == TargetOpcode::G_INTRINSIC) { Intrinsic::ID IntrinsicID = cast(*MI).getIntrinsicID(); // Only float point intrinsic has neg & neg_hi bits if (IntrinsicID == Intrinsic::amdgcn_fdot2) - StatOptions.HasNeg = 1; + StatOptions.HasNeg = true; else - StatOptions.HasNeg = 0; + StatOptions.HasNeg = false; } else - StatOptions.HasNeg = 0; + StatOptions.HasNeg = false; // Assume all complex pattern of VOP3P has opsel - StatOptions.HasOpsel = 1; + StatOptions.HasOpsel = true; } static SmallVector> @@ -4715,7 +4715,7 @@ static bool isSameOperand(const MachineOperand *Op1, return Op1->isIdenticalTo(*Op2); } -unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) { +static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) { // SrcStatus::IS_LOWER_HALF remain 0. if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) { Mods ^= SISrcMods::NEG_HI; @@ -4732,7 +4732,7 @@ unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) { Mods |= SISrcMods::OP_SEL_0; } else if (LoStat == SrcStatus::IS_UPPER_HALF) Mods |= SISrcMods::OP_SEL_0; - else if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) + else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG) Mods |= SISrcMods::NEG; else if (LoStat == SrcStatus::IS_HI_NEG) Mods ^= SISrcMods::NEG; @@ -4741,28 +4741,22 @@ unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) { } static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, - unsigned int &Mods, const MachineOperand *NewOp, + const MachineOperand *NewOp, const MachineOperand *RootOp, const SIInstrInfo &TII, const MachineRegisterInfo &MRI) { if (NewOp->isReg()) { - if (isSameBitWidth(NewOp, RootOp, MRI) && - (HiStat == SrcStatus::IS_UPPER_HALF || - HiStat == SrcStatus::IS_UPPER_HALF_NEG || - HiStat == SrcStatus::IS_LOWER_HALF || - HiStat == SrcStatus::IS_LOWER_HALF_NEG) && - (LoStat == SrcStatus::IS_UPPER_HALF || - LoStat == SrcStatus::IS_UPPER_HALF_NEG || - LoStat == SrcStatus::IS_LOWER_HALF || - LoStat == SrcStatus::IS_LOWER_HALF_NEG)) { - return true; - } - } else { - if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) && - (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) && - isInlinableFPConstant(*NewOp, TII)) { - return true; - } - } + auto IsHalfState = [](SrcStatus S) { + return S == SrcStatus::IS_UPPER_HALF || + S == SrcStatus::IS_UPPER_HALF_NEG || + S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG; + }; + return isSameBitWidth(NewOp, RootOp, MRI) && IsHalfState(LoStat) && + IsHalfState(HiStat); + } else + return ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) && + (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) && + isInlinableFPConstant(*NewOp, TII)); + return false; } @@ -4820,7 +4814,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, for (int i = StatlistHi.size() - 1; i >= 0; i--) { for (int j = StatlistLo.size() - 1; j >= 0; j--) { if (isSameOperand(StatlistHi[i].first, StatlistLo[j].first) && - isValidToPack(StatlistHi[i].second, StatlistLo[j].second, Mods, + isValidToPack(StatlistHi[i].second, StatlistLo[j].second, StatlistHi[i].first, RootOp, TII, MRI)) return {StatlistHi[i].first, updateMods(StatlistHi[i].second, StatlistLo[j].second, Mods)}; diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index aad7a088551b2..50921879cd1f2 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -466,7 +466,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From d7de92f5b2dd791e34596e4381105dcf88252228 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 26 Mar 2025 20:25:23 +0800 Subject: [PATCH 17/20] fix lit --- llvm/test/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 50921879cd1f2..aad7a088551b2 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -466,7 +466,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("utf-8") + readobj_out = readobj_cmd.stdout.read().decode("ascii") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 45ed99410f46c748cd957297cda9c03d07311884 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Wed, 26 Mar 2025 20:45:31 +0800 Subject: [PATCH 18/20] avoid global variable --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 81 +++++++++---------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ee99276be907e..d619be62194e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4615,55 +4615,53 @@ calcNextStatus(std::pair Curr, return std::nullopt; } -struct { +class statOptions { +private: bool HasNeg; bool HasOpsel; -} StatOptions; -static bool checkOptions(SrcStatus Stat) { - if (!StatOptions.HasNeg && - (Stat >= SrcStatus::NEG_START || Stat <= SrcStatus::NEG_END)) { - return false; +public: + statOptions(const MachineOperand *RootOp, const MachineRegisterInfo &MRI) { + const MachineInstr *MI = RootOp->getParent(); + unsigned Opc = MI->getOpcode(); + HasNeg = false; + HasOpsel = false; + if (Opc < TargetOpcode::GENERIC_OP_END) { + // Keep same for gerneric op + HasNeg = true; + } else if (Opc == TargetOpcode::G_INTRINSIC) { + Intrinsic::ID IntrinsicID = cast(*MI).getIntrinsicID(); + // Only float point intrinsic has neg & neg_hi bits + if (IntrinsicID == Intrinsic::amdgcn_fdot2) + HasNeg = true; + } + + // Assume all complex pattern of VOP3P has opsel + HasOpsel = true; } - if (!StatOptions.HasOpsel && - (Stat >= SrcStatus::HALF_START || Stat >= SrcStatus::HALF_END)) { - return false; + bool checkOptions(SrcStatus Stat) const { + if (!HasNeg && + (Stat >= SrcStatus::NEG_START || Stat <= SrcStatus::NEG_END)) { + return false; + } + if (!HasOpsel && + (Stat >= SrcStatus::HALF_START || Stat >= SrcStatus::HALF_END)) { + return false; + } + return true; } - return true; -} - -static void setUpOptions(const MachineOperand *RootOp, - const MachineRegisterInfo &MRI) { - const MachineInstr *MI = RootOp->getParent(); - unsigned Opc = MI->getOpcode(); - - if (Opc < TargetOpcode::GENERIC_OP_END) { - // Keep same for gerneric op - StatOptions.HasNeg = true; - } else if (Opc == TargetOpcode::G_INTRINSIC) { - Intrinsic::ID IntrinsicID = cast(*MI).getIntrinsicID(); - // Only float point intrinsic has neg & neg_hi bits - if (IntrinsicID == Intrinsic::amdgcn_fdot2) - StatOptions.HasNeg = true; - else - StatOptions.HasNeg = false; - } else - StatOptions.HasNeg = false; - - // Assume all complex pattern of VOP3P has opsel - StatOptions.HasOpsel = true; -} +}; static SmallVector> getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, - int MaxDepth = 6) { + statOptions StatOptions, int MaxDepth = 6) { int Depth = 0; auto Curr = calcNextStatus({Op, SrcStatus::IS_SAME}, MRI); SmallVector, 4> Statlist; while (Depth <= MaxDepth && Curr.has_value()) { Depth++; - if (checkOptions(Curr.value().second)) { + if (StatOptions.checkOptions(Curr.value().second)) { Statlist.push_back(Curr.value()); } Curr = calcNextStatus(Curr.value(), MRI); @@ -4674,7 +4672,7 @@ getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, static std::pair getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI, - int MaxDepth = 6) { + statOptions StatOptions, int MaxDepth = 6) { int Depth = 0; std::pair LastSameOrNeg = { Op, SrcStatus::IS_SAME}; @@ -4682,7 +4680,7 @@ getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI, while (Depth <= MaxDepth && Curr.has_value()) { Depth++; - if (checkOptions(Curr.value().second)) { + if (StatOptions.checkOptions(Curr.value().second)) { if (Curr.value().second == SrcStatus::IS_SAME || Curr.value().second == SrcStatus::IS_HI_NEG || Curr.value().second == SrcStatus::IS_LO_NEG || @@ -4772,9 +4770,10 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, return {Op, Mods}; } - setUpOptions(Op, MRI); + statOptions StatOptions(Op, MRI); - std::pair Stat = getLastSameOrNeg(Op, MRI); + std::pair Stat = + getLastSameOrNeg(Op, MRI, StatOptions); if (!Stat.first->isReg()) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; @@ -4796,7 +4795,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, } SmallVector> StatlistHi = - getSrcStats(&MI->getOperand(2), MRI); + getSrcStats(&MI->getOperand(2), MRI, StatOptions); if (StatlistHi.size() == 0) { Mods |= SISrcMods::OP_SEL_1; @@ -4804,7 +4803,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, } SmallVector> StatlistLo = - getSrcStats(&MI->getOperand(1), MRI); + getSrcStats(&MI->getOperand(1), MRI, StatOptions); if (StatlistLo.size() == 0) { Mods |= SISrcMods::OP_SEL_1; From a792c1d91bf2e5e88cda8321d674112aea1652af Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Fri, 28 Mar 2025 11:28:04 +0800 Subject: [PATCH 19/20] fix comments --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 90 +++++++++---------- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 + 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index d619be62194e6..d46f74cb5004d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4319,8 +4319,11 @@ enum class SrcStatus { IS_UPPER_HALF, IS_LOWER_HALF, IS_UPPER_HALF_NEG, + // This means current op = [op_upper, op_lower] and src = -op_lower IS_LOWER_HALF_NEG, IS_HI_NEG, + // This means current op = [op_upper, op_lower] and src = [op_upper, + // -op_lower] IS_LO_NEG, IS_BOTH_NEG, INVALID, @@ -4383,18 +4386,18 @@ retOpStat(const MachineOperand *Op, SrcStatus Stat, return std::nullopt; } -enum class TypeClass { VECTOR_OF_TWO, SCALAR, NON_OF_LISTED }; +enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED }; static TypeClass isVectorOfTwoOrScalar(const MachineOperand *Op, const MachineRegisterInfo &MRI) { if (!Op->isReg() || Op->getReg().isPhysical()) - return TypeClass::NON_OF_LISTED; + return TypeClass::NONE_OF_LISTED; LLT OpTy = MRI.getType(Op->getReg()); if (OpTy.isScalar()) return TypeClass::SCALAR; if (OpTy.isVector() && OpTy.getNumElements() == 2) return TypeClass::VECTOR_OF_TWO; - return TypeClass::NON_OF_LISTED; + return TypeClass::NONE_OF_LISTED; } static SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, @@ -4615,19 +4618,19 @@ calcNextStatus(std::pair Curr, return std::nullopt; } -class statOptions { +class searchOptions { private: - bool HasNeg; - bool HasOpsel; + bool HasNeg = false; + // Assume all complex pattern of VOP3P has opsel + bool HasOpsel = true; public: - statOptions(const MachineOperand *RootOp, const MachineRegisterInfo &MRI) { + searchOptions(const MachineOperand *RootOp, const MachineRegisterInfo &MRI) { const MachineInstr *MI = RootOp->getParent(); unsigned Opc = MI->getOpcode(); - HasNeg = false; - HasOpsel = false; + if (Opc < TargetOpcode::GENERIC_OP_END) { - // Keep same for gerneric op + // Keep same for generic op HasNeg = true; } else if (Opc == TargetOpcode::G_INTRINSIC) { Intrinsic::ID IntrinsicID = cast(*MI).getIntrinsicID(); @@ -4635,9 +4638,6 @@ class statOptions { if (IntrinsicID == Intrinsic::amdgcn_fdot2) HasNeg = true; } - - // Assume all complex pattern of VOP3P has opsel - HasOpsel = true; } bool checkOptions(SrcStatus Stat) const { if (!HasNeg && @@ -4654,14 +4654,14 @@ class statOptions { static SmallVector> getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, - statOptions StatOptions, int MaxDepth = 6) { + searchOptions SearchOptions, int MaxDepth = 6) { int Depth = 0; auto Curr = calcNextStatus({Op, SrcStatus::IS_SAME}, MRI); SmallVector, 4> Statlist; while (Depth <= MaxDepth && Curr.has_value()) { Depth++; - if (StatOptions.checkOptions(Curr.value().second)) { + if (SearchOptions.checkOptions(Curr.value().second)) { Statlist.push_back(Curr.value()); } Curr = calcNextStatus(Curr.value(), MRI); @@ -4672,7 +4672,7 @@ getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, static std::pair getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI, - statOptions StatOptions, int MaxDepth = 6) { + searchOptions SearchOptions, int MaxDepth = 6) { int Depth = 0; std::pair LastSameOrNeg = { Op, SrcStatus::IS_SAME}; @@ -4680,7 +4680,7 @@ getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI, while (Depth <= MaxDepth && Curr.has_value()) { Depth++; - if (StatOptions.checkOptions(Curr.value().second)) { + if (SearchOptions.checkOptions(Curr.value().second)) { if (Curr.value().second == SrcStatus::IS_SAME || Curr.value().second == SrcStatus::IS_HI_NEG || Curr.value().second == SrcStatus::IS_LO_NEG || @@ -4770,10 +4770,10 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, return {Op, Mods}; } - statOptions StatOptions(Op, MRI); + searchOptions SearchOptions(Op, MRI); std::pair Stat = - getLastSameOrNeg(Op, MRI, StatOptions); + getLastSameOrNeg(Op, MRI, SearchOptions); if (!Stat.first->isReg()) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; @@ -4795,7 +4795,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, } SmallVector> StatlistHi = - getSrcStats(&MI->getOperand(2), MRI, StatOptions); + getSrcStats(&MI->getOperand(2), MRI, SearchOptions); if (StatlistHi.size() == 0) { Mods |= SISrcMods::OP_SEL_1; @@ -4803,7 +4803,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, } SmallVector> StatlistLo = - getSrcStats(&MI->getOperand(1), MRI, StatOptions); + getSrcStats(&MI->getOperand(1), MRI, SearchOptions); if (StatlistLo.size() == 0) { Mods |= SISrcMods::OP_SEL_1; @@ -4846,10 +4846,17 @@ static bool checkRB(const MachineOperand *Op, unsigned int RBNo, return RB->getID() == RBNo; } -const MachineOperand * -getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp, - const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, const SIInstrInfo &TII) { +// This function is used to get the correct register bank for returned reg +// Assume: +// 1. VOP3P is always legal for VGPR +// 2. RootOp's regbank is legal +// Thus +// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR +// 2. If RootOp is VGPR, then NewOp must be VGPR +static const MachineOperand * +getLegalRegBank(const MachineOperand *NewOp, const MachineOperand *RootOp, + const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, const SIInstrInfo &TII) { // RootOp can only be VGPR or SGPR (some hand written cases such as // inst-select-ashr.v2s16.mir::ashr_v2s16_vs). if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) || @@ -4877,18 +4884,18 @@ getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp, } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { - MachineRegisterInfo &MRI - = Root.getParent()->getParent()->getParent()->getRegInfo(); - - auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI); +AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root, + bool IsDOT) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI, IsDOT); if (!(Op->isReg())) return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; - Op = getVReg(Op, &Root, RBI, MRI, TRI, TII); + Op = getLegalRegBank(Op, &Root, RBI, MRI, TRI, TII); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods @@ -4896,22 +4903,15 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { - MachineRegisterInfo &MRI - = Root.getParent()->getParent()->getParent()->getRegInfo(); +AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { - auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI, true); - if (!(Op->isReg())) - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods - }}; + return selectVOP3PRetHelper(Root); +} - Op = getVReg(Op, &Root, RBI, MRI, TRI, TII); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods - }}; +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { + + return selectVOP3PRetHelper(Root, true); } InstructionSelector::ComplexRendererFns diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index dd172edfdf216..d77fab99d7251 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -190,6 +190,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { std::pair selectVOP3PModsImpl(const MachineOperand *Op, const MachineRegisterInfo &MRI, bool IsDOT = false) const; + InstructionSelector::ComplexRendererFns + selectVOP3PRetHelper(MachineOperand &Root, bool IsDOT = false) const; InstructionSelector::ComplexRendererFns selectVOP3PMods(MachineOperand &Root) const; From 0eac2e9108c17ca4b88e88ca8804802c25cd4917 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Fri, 11 Apr 2025 13:23:18 +0800 Subject: [PATCH 20/20] fix comments and case changes --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 62 +++++++++---------- .../CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll | 4 +- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 1e0159457d0fa..87af467ac8f1e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4324,11 +4324,11 @@ enum class SrcStatus { IS_UPPER_HALF, IS_LOWER_HALF, IS_UPPER_HALF_NEG, - // This means current op = [op_upper, op_lower] and src = -op_lower + // This means current op = [op_upper, op_lower] and src = -op_lower. IS_LOWER_HALF_NEG, IS_HI_NEG, // This means current op = [op_upper, op_lower] and src = [op_upper, - // -op_lower] + // -op_lower]. IS_LO_NEG, IS_BOTH_NEG, INVALID, @@ -4548,8 +4548,9 @@ static SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S, return SrcStatus::IS_LOWER_HALF_NEG; } break; + default: + llvm_unreachable("unexpected SrcStatus"); } - llvm_unreachable("unexpected SrcStatus"); } static std::optional> @@ -4558,15 +4559,9 @@ calcNextStatus(std::pair Curr, if (!Curr.first->isReg()) return std::nullopt; - const MachineInstr *MI = nullptr; - - if (!Curr.first->isDef()) - MI = MRI.getVRegDef(Curr.first->getReg()); - else - MI = Curr.first->getParent(); - - if (!MI) - return std::nullopt; + const MachineInstr *MI = Curr.first->isDef() + ? Curr.first->getParent() + : MRI.getVRegDef(Curr.first->getReg()); unsigned Opc = MI->getOpcode(); @@ -4626,7 +4621,7 @@ calcNextStatus(std::pair Curr, class searchOptions { private: bool HasNeg = false; - // Assume all complex pattern of VOP3P has opsel + // Assume all complex pattern of VOP3P has opsel. bool HasOpsel = true; public: @@ -4635,11 +4630,11 @@ class searchOptions { unsigned Opc = MI->getOpcode(); if (Opc < TargetOpcode::GENERIC_OP_END) { - // Keep same for generic op + // Keep same for generic op. HasNeg = true; } else if (Opc == TargetOpcode::G_INTRINSIC) { Intrinsic::ID IntrinsicID = cast(*MI).getIntrinsicID(); - // Only float point intrinsic has neg & neg_hi bits + // Only float point intrinsic has neg & neg_hi bits. if (IntrinsicID == Intrinsic::amdgcn_fdot2) HasNeg = true; } @@ -4666,9 +4661,8 @@ getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI, while (Depth <= MaxDepth && Curr.has_value()) { Depth++; - if (SearchOptions.checkOptions(Curr.value().second)) { + if (SearchOptions.checkOptions(Curr.value().second)) Statlist.push_back(Curr.value()); - } Curr = calcNextStatus(Curr.value(), MRI); } @@ -4712,9 +4706,9 @@ static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2, static bool isSameOperand(const MachineOperand *Op1, const MachineOperand *Op2) { - if (Op1->isReg()) { + if (Op1->isReg()) return Op2->isReg() && Op1->getReg() == Op2->getReg(); - } + return Op1->isIdenticalTo(*Op2); } @@ -4769,7 +4763,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, bool IsDOT) const { unsigned Mods = 0; const MachineOperand *Op = RootOp; - // No modification if Root type is not form of <2 x Type> + // No modification if Root type is not form of <2 x Type>. if (isVectorOfTwoOrScalar(Op, MRI) != TypeClass::VECTOR_OF_TWO) { Mods |= SISrcMods::OP_SEL_1; return {Op, Mods}; @@ -4815,13 +4809,13 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp, return {Op, Mods}; } - for (int i = StatlistHi.size() - 1; i >= 0; i--) { - for (int j = StatlistLo.size() - 1; j >= 0; j--) { - if (isSameOperand(StatlistHi[i].first, StatlistLo[j].first) && - isValidToPack(StatlistHi[i].second, StatlistLo[j].second, - StatlistHi[i].first, RootOp, TII, MRI)) - return {StatlistHi[i].first, - updateMods(StatlistHi[i].second, StatlistLo[j].second, Mods)}; + for (int I = StatlistHi.size() - 1; I >= 0; I--) { + for (int J = StatlistLo.size() - 1; J >= 0; J--) { + if (isSameOperand(StatlistHi[I].first, StatlistLo[J].first) && + isValidToPack(StatlistHi[I].second, StatlistLo[J].second, + StatlistHi[I].first, RootOp, TII, MRI)) + return {StatlistHi[I].first, + updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)}; } } // Packed instructions do not have abs modifiers. @@ -4851,18 +4845,18 @@ static bool checkRB(const MachineOperand *Op, unsigned int RBNo, return RB->getID() == RBNo; } -// This function is used to get the correct register bank for returned reg +// This function is used to get the correct register bank for returned reg. // Assume: -// 1. VOP3P is always legal for VGPR -// 2. RootOp's regbank is legal +// 1. VOP3P is always legal for VGPR. +// 2. RootOp's regbank is legal. // Thus -// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR -// 2. If RootOp is VGPR, then NewOp must be VGPR +// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR. +// 2. If RootOp is VGPR, then NewOp must be VGPR. static const MachineOperand * getLegalRegBank(const MachineOperand *NewOp, const MachineOperand *RootOp, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII) { - // RootOp can only be VGPR or SGPR (some hand written cases such as + // RootOp can only be VGPR or SGPR (some hand written cases such as. // inst-select-ashr.v2s16.mir::ashr_v2s16_vs). if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) || checkRB(NewOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) @@ -4884,7 +4878,7 @@ getLegalRegBank(const MachineOperand *NewOp, const MachineOperand *RootOp, BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) .addReg(NewOp->getReg()); - // only accept VGPR + // only accept VGPR. return &MIB->getOperand(0); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll index 9f0641b715d36..534b454775502 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -218,7 +218,7 @@ define <2 x half> @fmul_v2_half_neg_lo(<2 x half> %a, <2 x half> %b) #0 { ; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 ; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -250,7 +250,7 @@ define <2 x half> @fmul_v2_half_neg_lo1(<2 x half> %a, <2 x half> %b) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 ; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ;