diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ebe38de1636be..4ca1011ea1312 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -501,6 +501,9 @@ extern char &SIModeRegisterID;
 void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &);
 extern char &AMDGPUInsertDelayAluID;
 
+void initializeAMDGPULowerVGPREncodingPass(PassRegistry &);
+extern char &AMDGPULowerVGPREncodingID;
+
 void initializeSIInsertHardClausesLegacyPass(PassRegistry &);
 extern char &SIInsertHardClausesID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
new file mode 100644
index 0000000000000..ca06c316c2bfc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -0,0 +1,354 @@
+//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lower VGPRs above the first 256 on gfx1250.
+///
+/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
+/// the VGPR addressing mode. The mode change is effective until the next
+/// change. This instruction provides the high bits of a VGPR address for four
+/// of the operands: vdst, src0, src1, and src2, or for another 4 operands
+/// depending on the instruction encoding. If bits are set, they are added as
+/// MSBs to the corresponding operand's VGPR number.
+///
+/// There is no need to replace the actual register operands because high and
+/// low VGPRs share the same encoding. I.e. v0 has the encoding 0x100, and so
+/// does v256; v1 has the encoding 0x101, and v257 has the same encoding. So
+/// high VGPRs survive until the actual encoding and produce the same bit
+/// encoding.
+///
+/// As a result, the pass only inserts S_SET_VGPR_MSB to provide an actual
+/// offset to the VGPR addresses of the subsequent instructions. The
+/// InstPrinter takes care of printing a low VGPR instead of a high one. In
+/// principle it would be viable to print the actual high VGPR numbers, but
+/// that would disagree with the disassembler output and create a situation
+/// where the asm text is not deterministic.
+///
+/// This pass establishes the convention that non-fallthrough basic blocks
+/// start with all 4 MSBs zero; otherwise the disassembly would not be
+/// readable. An optimization here is possible but deemed not desirable
+/// because of the readability concerns.
+///
+/// Consequently, the ABI is set to expect all 4 MSBs to be zero on entry. The
+/// pass must run very late in the pipeline to make sure no changes to VGPR
+/// operands are made after it.
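+///
+/// As an illustration (taken from this patch's MIR test, not an additional
+/// convention), the S_SET_VGPR_MSB immediate packs one 2-bit MSB field per
+/// operand, so with the vdst and src1 fields set to 1:
+///
+///   s_set_vgpr_msb 0x44          ; msbs: dst=1 src0=0 src1=1 src2=0
+///   v_add_f32_e64 v2, v0, v251   ; encodes v258 = v0 + v507
+///
+/// the v2 and v251 operand encodings address v258 (2 + 256) and v507
+/// (251 + 256), while src0 still addresses the low v0.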
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PackedVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"
+
+namespace {
+
+class AMDGPULowerVGPREncoding : public MachineFunctionPass {
+  static constexpr unsigned OpNum = 4;
+  static constexpr unsigned BitsPerField = 2;
+  static constexpr unsigned NumFields = 4;
+  static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
+  using ModeType = PackedVector<unsigned, BitsPerField,
+                                std::bitset<BitsPerField * NumFields>>;
+
+  class ModeTy : public ModeType {
+  public:
+    // The bitset constructor zero-initializes all bits.
+    ModeTy() : ModeType(0) {}
+
+    operator int64_t() const { return raw_bits().to_ulong(); }
+
+    static ModeTy fullMask() {
+      ModeTy M;
+      M.raw_bits().flip();
+      return M;
+    }
+  };
+
+public:
+  static char ID;
+
+  AMDGPULowerVGPREncoding() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+
+  /// Most recent s_set_* instruction.
+  MachineInstr *MostRecentModeSet;
+
+  /// Whether the current mode is known.
+  bool CurrentModeKnown;
+
+  /// Current mode bits.
+  ModeTy CurrentMode;
+
+  /// Current mask of mode bits that instructions since MostRecentModeSet care
+  /// about.
+  ModeTy CurrentMask;
+
+  /// Number of current hard clause instructions.
+  unsigned ClauseLen;
+
+  /// Number of hard clause instructions remaining.
+  unsigned ClauseRemaining;
+
+  /// Clause group breaks.
+  unsigned ClauseBreaks;
+
+  /// Last hard clause instruction.
+  MachineInstr *Clause;
+
+  /// Insert mode change before \p I. \returns true if mode was changed.
+  bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
+
+  /// Reset mode to default.
+  void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
+
+  /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
+  std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
+
+  /// Handle a single \p MI. \returns true if changed.
+  bool runOnMachineInstr(MachineInstr &MI);
+
+  /// Compute the mode and mode mask for a single \p MI given the \p Ops
+  /// operand bit mapping. Optionally takes a second array \p Ops2 for VOPD.
+  /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
+  /// is checked.
+  void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
+                   const AMDGPU::OpName Ops[OpNum],
+                   const AMDGPU::OpName *Ops2 = nullptr);
+
+  /// Check if an instruction \p I is within a clause and return a suitable
+  /// iterator to insert the mode change. It may also modify the S_CLAUSE
+  /// instruction to extend it or drop the clause if it cannot be adjusted.
+  MachineInstr *handleClause(MachineInstr *I);
+};
+
+bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
+                                      MachineInstr *I) {
+  assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());
+
+  if (CurrentModeKnown) {
+    auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
+
+    if ((Delta & Mask.raw_bits()).none()) {
+      CurrentMask |= Mask;
+      return false;
+    }
+
+    if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
+      CurrentMode |= NewMode;
+      CurrentMask |= Mask;
+
+      MostRecentModeSet->getOperand(0).setImm(CurrentMode);
+      return true;
+    }
+  }
+
+  I = handleClause(I);
+  MostRecentModeSet =
+      BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
+          .addImm(NewMode);
+
+  CurrentMode = NewMode;
+  CurrentMask = Mask;
+  CurrentModeKnown = true;
+  return true;
+}
+
+std::optional<unsigned>
+AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
+  if (!MO.isReg())
+    return std::nullopt;
+
+  MCRegister Reg = MO.getReg();
+  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
+  if (!RC || !TRI->isVGPRClass(RC))
+    return std::nullopt;
+
+  unsigned Idx = TRI->getHWRegIndex(Reg);
+  return Idx >> 8;
+}
+
+void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
+                                          MachineInstr &MI,
+                                          const AMDGPU::OpName Ops[OpNum],
+                                          const AMDGPU::OpName *Ops2) {
+  NewMode = {};
+  Mask = {};
+
+  for (unsigned I = 0; I < OpNum; ++I) {
+    MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
+
+    std::optional<unsigned> MSBits;
+    if (Op)
+      MSBits = getMSBs(*Op);
+
+#if !defined(NDEBUG)
+    if (MSBits.has_value() && Ops2) {
+      auto Op2 = TII->getNamedOperand(MI, Ops2[I]);
+      if (Op2) {
+        std::optional<unsigned> MSBits2;
+        MSBits2 = getMSBs(*Op2);
+        if (MSBits2.has_value() && MSBits != MSBits2)
+          llvm_unreachable("Invalid VOPD pair was created");
+      }
+    }
+#endif
+
+    if (!MSBits.has_value() && Ops2) {
+      Op = TII->getNamedOperand(MI, Ops2[I]);
+      if (Op)
+        MSBits = getMSBs(*Op);
+    }
+
+    if (!MSBits.has_value())
+      continue;
+
+    // Skip tied uses of src2 of VOP2; these are handled along with the defs,
+    // and only the vdst bit affects these operands. We cannot skip tied uses
+    // of VOP3; those uses are real even if they must match the vdst.
+    if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
+        (SIInstrInfo::isVOP2(MI) ||
+         (SIInstrInfo::isVOP3(MI) &&
+          TII->hasVALU32BitEncoding(MI.getOpcode()))))
+      continue;
+
+    NewMode[I] = MSBits.value();
+    Mask[I] = FieldMask;
+  }
+}
+
+bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
+  auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
+  if (Ops.first) {
+    ModeTy NewMode, Mask;
+    computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
+    return setMode(NewMode, Mask, &MI);
+  }
+  assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
+
+  return false;
+}
+
+MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
+  if (!ClauseRemaining)
+    return I;
+
+  // A clause cannot start with a special instruction; place the mode change
+  // right before the clause.
+  if (ClauseRemaining == ClauseLen) {
+    I = Clause->getPrevNode();
+    assert(I->isBundle());
+    return I;
+  }
+
+  // If a clause defines breaks, no group can start with a mode change, so
+  // just drop the clause.
+  if (ClauseBreaks) {
+    Clause->eraseFromBundle();
+    ClauseRemaining = 0;
+    return I;
+  }
+
+  // Otherwise adjust the number of instructions in the clause if it fits.
+  // If it does not fit, the clause will just become shorter. Since the length
+  // recorded in the clause is one less than the actual number of
+  // instructions, increment the length after the update.
+  // Note that SIMM16[5:0] must be 1-62, not 0 or 63.
+  if (ClauseLen < 63)
+    Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8));
+
+  ++ClauseLen;
+
+  return I;
+}
+
+bool AMDGPULowerVGPREncoding::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.has1024AddressableVGPRs())
+    return false;
+
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
+
+  bool Changed = false;
+  ClauseLen = ClauseRemaining = 0;
+  CurrentMode.reset();
+  CurrentMask.reset();
+  CurrentModeKnown = true;
+  for (auto &MBB : MF) {
+    MostRecentModeSet = nullptr;
+
+    for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
+      if (MI.isMetaInstruction())
+        continue;
+
+      if (MI.isTerminator() || MI.isCall()) {
+        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+          CurrentMode.reset();
+          CurrentModeKnown = true;
+        } else
+          resetMode(&MI);
+        continue;
+      }
+
+      if (MI.isInlineAsm()) {
+        if (TII->hasVGPRUses(MI))
+          resetMode(&MI);
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
+        assert(!ClauseRemaining && "Nested clauses are not supported");
+        ClauseLen = MI.getOperand(0).getImm();
+        ClauseBreaks = (ClauseLen >> 8) & 15;
+        ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1;
+        Clause = &MI;
+        continue;
+      }
+
+      Changed |= runOnMachineInstr(MI);
+
+      if (ClauseRemaining)
+        --ClauseRemaining;
+    }
+
+    // If we're falling through to a block that has at least one other
+    // predecessor, we no longer know the mode.
+    MachineBasicBlock *Next = MBB.getNextNode();
+    if (Next && Next->pred_size() >= 2 &&
+        llvm::is_contained(Next->predecessors(), &MBB)) {
+      if (CurrentMode.raw_bits().any())
+        CurrentModeKnown = false;
+    }
+  }
+
+  return Changed;
+}
+
+} // namespace
+
+char AMDGPULowerVGPREncoding::ID = 0;
+
+char &llvm::AMDGPULowerVGPREncodingID = AMDGPULowerVGPREncoding::ID;
+
+INITIALIZE_PASS(AMDGPULowerVGPREncoding, DEBUG_TYPE,
+                "AMDGPU Lower VGPR Encoding", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index c84a0f6e31384..6acbf52b97de5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -373,6 +373,13 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
                        MF->getInfo<SIMachineFunctionInfo>(), *OutStreamer);
 
+  if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
+    unsigned V = MI->getOperand(0).getImm();
+    OutStreamer->AddComment(
+        " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) +
+        " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3));
+  }
+
   MCInst TmpInst;
   MCInstLowering.lower(MI, TmpInst);
   EmitToStreamer(*OutStreamer, TmpInst);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4a2f0a13b1325..072becb9a2ad5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -584,6 +584,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
   initializeAMDGPUInsertDelayAluLegacyPass(*PR);
+  initializeAMDGPULowerVGPREncodingPass(*PR);
   initializeSIInsertHardClausesLegacyPass(*PR);
   initializeSIInsertWaitcntsLegacyPass(*PR);
   initializeSIModeRegisterLegacyPass(*PR);
@@ -1799,6 +1800,8 @@ void GCNPassConfig::addPreEmitPass() {
   addPass(&AMDGPUWaitSGPRHazardsLegacyID);
 
+  addPass(&AMDGPULowerVGPREncodingID);
+
   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
     addPass(&AMDGPUInsertDelayAluID);
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a915c4076ca2a..aae56eef73edd 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -86,6 +86,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMCInstLower.cpp
   AMDGPUMemoryUtils.cpp
   AMDGPUIGroupLP.cpp
+  AMDGPULowerVGPREncoding.cpp
   AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ad122390e1f03..d1e8b7e4bad0d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -324,6 +324,18 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI,
   }
 }
 
+// \returns the low VGPR (v0..v255) corresponding to a high VGPR \p Reg
+// [v256..v1023], or \p Reg itself otherwise.
+static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+  unsigned Enc = MRI.getEncodingValue(Reg);
+  unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+  if (Idx < 0x100)
+    return Reg;
+
+  const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI);
+  return RC->getRegister(Idx % 0x100);
+}
+
 void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
                                         const MCRegisterInfo &MRI) {
 #if !defined(NDEBUG)
@@ -337,7 +349,17 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
   }
 #endif
 
-  O << getRegisterName(Reg);
+  unsigned PrintReg = getRegForPrinting(Reg, MRI);
+  O << getRegisterName(PrintReg);
+
+  if (PrintReg != Reg.id())
+    O << " /*" << getRegisterName(Reg) << "*/";
+}
+
+void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, unsigned Opc,
+                                        unsigned OpNo, raw_ostream &O,
+                                        const MCRegisterInfo &MRI) {
+  printRegOperand(Reg, O, MRI);
 }
 
 void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -722,7 +744,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
   const MCOperand &Op = MI->getOperand(OpNo);
 
   if (Op.isReg()) {
-    printRegOperand(Op.getReg(), O, MRI);
+    printRegOperand(Op.getReg(), MI->getOpcode(), OpNo, O, MRI);
 
     // Check if operand register class contains register used.
// Intention: print disassembler message when invalid code is decoded, @@ -1133,7 +1155,7 @@ void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo, OpNo = OpNo - N + N / 2; if (En & (1 << N)) - printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI); + printRegOperand(MI->getOperand(OpNo).getReg(), Opc, OpNo, O, MRI); else O << "off"; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index a92f99c3c0e4b..21cc2f229de91 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -35,6 +35,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O) override; static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI); + void printRegOperand(MCRegister Reg, unsigned Opc, unsigned OpNo, + raw_ostream &O, const MCRegisterInfo &MRI); private: void printU16ImmOperand(const MCInst *MI, unsigned OpNo, @@ -70,7 +72,7 @@ class AMDGPUInstPrinter : public MCInstPrinter { void printSymbolicFormat(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O); - void printRegOperand(unsigned RegNo, raw_ostream &O); + void printRegOperand(MCRegister Reg, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index fe849cafb65d1..643c664e39f1e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9270,6 +9270,9 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const { + if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES) + return nullptr; + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); if (Idx == -1) return nullptr; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index ff5cbd55484cf..6348d3607878e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3338,6 +3338,112 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } +const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, + const MCRegisterInfo &MRI) { + const unsigned VGPRClasses[] = { + AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID, + AMDGPU::VReg_64RegClassID, AMDGPU::VReg_96RegClassID, + AMDGPU::VReg_128RegClassID, AMDGPU::VReg_160RegClassID, + AMDGPU::VReg_192RegClassID, AMDGPU::VReg_224RegClassID, + AMDGPU::VReg_256RegClassID, AMDGPU::VReg_288RegClassID, + AMDGPU::VReg_320RegClassID, AMDGPU::VReg_352RegClassID, + AMDGPU::VReg_384RegClassID, AMDGPU::VReg_512RegClassID, + AMDGPU::VReg_1024RegClassID}; + + for (unsigned RCID : VGPRClasses) { + const MCRegisterClass &RC = MRI.getRegClass(RCID); + if (RC.contains(Reg)) + return &RC; + } + + return nullptr; +} + +unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) { + unsigned Enc = MRI.getEncodingValue(Reg); + unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + return Idx >> 8; +} + +MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, + const MCRegisterInfo &MRI) { + unsigned Enc = MRI.getEncodingValue(Reg); + unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + 
if (Idx >= 0x100)
+    return AMDGPU::NoRegister;
+
+  const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI);
+  if (!RC)
+    return AMDGPU::NoRegister;
+  return RC->getRegister(Idx | (MSBs << 8));
+}
+
+std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *>
+getVGPRLoweringOperandTables(const MCInstrDesc &Desc) {
+  static const AMDGPU::OpName VOPOps[4] = {
+      AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2,
+      AMDGPU::OpName::vdst};
+  static const AMDGPU::OpName VDSOps[4] = {
+      AMDGPU::OpName::addr, AMDGPU::OpName::data0, AMDGPU::OpName::data1,
+      AMDGPU::OpName::vdst};
+  static const AMDGPU::OpName FLATOps[4] = {
+      AMDGPU::OpName::vaddr, AMDGPU::OpName::vdata,
+      AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdst};
+  static const AMDGPU::OpName BUFOps[4] = {
+      AMDGPU::OpName::vaddr, AMDGPU::OpName::NUM_OPERAND_NAMES,
+      AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdata};
+  static const AMDGPU::OpName VIMGOps[4] = {
+      AMDGPU::OpName::vaddr0, AMDGPU::OpName::vaddr1, AMDGPU::OpName::vaddr2,
+      AMDGPU::OpName::vdata};
+
+  // For VOPD instructions the MSB of a Y component operand's VGPR address is
+  // supposed to match the corresponding X operand; otherwise the VOPD shall
+  // not be combined.
+  static const AMDGPU::OpName VOPDOpsX[4] = {
+      AMDGPU::OpName::src0X, AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vsrc2X,
+      AMDGPU::OpName::vdstX};
+  static const AMDGPU::OpName VOPDOpsY[4] = {
+      AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y,
+      AMDGPU::OpName::vdstY};
+
+  unsigned TSFlags = Desc.TSFlags;
+
+  if (TSFlags &
+      (SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 |
+       SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) {
+    // LD_SCALE operands ignore the MSB.
+    if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 ||
+        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 ||
+        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 ||
+        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250)
+      return {};
+    return {VOPOps, nullptr};
+  }
+
+  if (TSFlags & SIInstrFlags::DS)
+    return {VDSOps, nullptr};
+
+  if (TSFlags & SIInstrFlags::FLAT)
+    return {FLATOps, nullptr};
+
+  if (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))
+    return {BUFOps, nullptr};
+
+  if (TSFlags & SIInstrFlags::VIMAGE)
+    return {VIMGOps, nullptr};
+
+  if (AMDGPU::isVOPD(Desc.getOpcode()))
+    return {VOPDOpsX, VOPDOpsY};
+
+  assert(!(TSFlags & SIInstrFlags::MIMG));
+
+  if (TSFlags & (SIInstrFlags::VSAMPLE | SIInstrFlags::EXP))
+    llvm_unreachable("Sample and export VGPR lowering is not implemented and"
+                     " these instructions are not expected on gfx1250");
+
+  return {};
+}
+
 bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) {
   uint64_t TSFlags = MII.get(Opcode).TSFlags;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 4ab17d8056459..3fcd16f9290b1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1786,6 +1786,25 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 /// \returns true if the intrinsic is uniform
 bool isIntrinsicAlwaysUniform(unsigned IntrID);
 
+/// \returns a register class for the physical register \p Reg if it is a VGPR
+/// or nullptr otherwise.
+const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+                                           const MCRegisterInfo &MRI);
+
+/// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the
+/// physical register \p Reg.
+unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI);
+
+/// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set.
+MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
+                          const MCRegisterInfo &MRI);
+
+/// \returns a table for the opcode with the given \p Desc mapping the VGPR
+/// MSBs set by the S_SET_VGPR_MSB to one of the 4 operands. In the case of
+/// VOPD, returns 2 tables, one for the X and one for the Y component.
+std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *>
+getVGPRLoweringOperandTables(const MCInstrDesc &Desc);
+
 /// \returns true if a memory instruction supports scale_offset modifier.
 bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode);
 
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 36231abda87db..65d0102a9d0dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -141,6 +141,7 @@
 ; GCN-O0-NEXT:       SI Final Branch Preparation
 ; GCN-O0-NEXT:       Post RA hazard recognizer
 ; GCN-O0-NEXT:       AMDGPU Insert waits for SGPR read hazards
+; GCN-O0-NEXT:       AMDGPU Lower VGPR Encoding
 ; GCN-O0-NEXT:       Branch relaxation pass
 ; GCN-O0-NEXT:       Register Usage Information Collector Pass
 ; GCN-O0-NEXT:       Remove Loads Into Fake Uses
@@ -426,6 +427,7 @@
 ; GCN-O1-NEXT:       SI peephole optimizations
 ; GCN-O1-NEXT:       Post RA hazard recognizer
 ; GCN-O1-NEXT:       AMDGPU Insert waits for SGPR read hazards
+; GCN-O1-NEXT:       AMDGPU Lower VGPR Encoding
 ; GCN-O1-NEXT:       AMDGPU Insert Delay ALU
 ; GCN-O1-NEXT:       Branch relaxation pass
 ; GCN-O1-NEXT:       Register Usage Information Collector Pass
@@ -740,6 +742,7 @@
 ; GCN-O1-OPTS-NEXT:       SI peephole optimizations
 ; GCN-O1-OPTS-NEXT:       Post RA hazard recognizer
 ; GCN-O1-OPTS-NEXT:       AMDGPU Insert waits for SGPR read hazards
+; GCN-O1-OPTS-NEXT:       AMDGPU Lower VGPR Encoding
 ; GCN-O1-OPTS-NEXT:       AMDGPU Insert Delay ALU
 ; GCN-O1-OPTS-NEXT:       Branch relaxation pass
 ; GCN-O1-OPTS-NEXT:       Register Usage Information Collector Pass
@@ -1060,6 +1063,7 @@
 ; GCN-O2-NEXT:       SI peephole optimizations
 ; GCN-O2-NEXT:       Post RA hazard recognizer
 ; GCN-O2-NEXT:       AMDGPU Insert waits for SGPR read hazards
+; GCN-O2-NEXT:       AMDGPU Lower VGPR Encoding
 ; GCN-O2-NEXT:       AMDGPU Insert Delay ALU
 ; GCN-O2-NEXT:       Branch relaxation pass
 ; GCN-O2-NEXT:       Register Usage Information Collector Pass
@@ -1393,6 +1397,7 @@
 ; GCN-O3-NEXT:       SI peephole optimizations
 ; GCN-O3-NEXT:       Post RA hazard recognizer
 ; GCN-O3-NEXT:       AMDGPU Insert waits for SGPR read hazards
+; GCN-O3-NEXT:       AMDGPU Lower VGPR Encoding
 ; GCN-O3-NEXT:       AMDGPU Insert Delay ALU
 ; GCN-O3-NEXT:       Branch relaxation pass
 ; GCN-O3-NEXT:       Register Usage Information Collector Pass
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
new file mode 100644
index 0000000000000..e7d676c6ba05c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
@@ -0,0 +1,848 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=amdgpu-lower-vgpr-encoding -o - %s | FileCheck -check-prefixes=GCN,ASM %s
+
+# ASM-LABEL: {{^}}high_vgprs:
+# DIS-LABEL: <high_vgprs>:
+---
+name: high_vgprs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+
+    ; VOP1
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x41
+    ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v255 /*v511*/
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr511, implicit $exec
+
+    ; No mask change
+    ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v254 /*v510*/
+    $vgpr257 = V_MOV_B32_e32 undef $vgpr510, implicit $exec
+
+    ; Single bit change
+    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; ASM-SAME: ; msbs: 
dst=0 src0=1 src1=0 src2=0 + ; GCN-NEXT: v_rcp_f32_e64 v255, v2 /*v258*/ + $vgpr255 = V_RCP_F32_e64 0, undef $vgpr258, 0, 0, implicit $exec, implicit $mode + + ; Reset + ; GCN-NEXT: s_set_vgpr_msb 0 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + ; GCN-NEXT: v_rcp_f32_e64 v255, v1 + $vgpr255 = V_RCP_F32_e64 0, undef $vgpr1, 0, 0, implicit $exec, implicit $mode + + ; VOP2 + + ; GCN-NEXT: s_set_vgpr_msb 5 + ; ASM-SAME: ; msbs: dst=0 src0=1 src1=1 src2=0 + ; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/ + $vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 + ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; GCN-NEXT: v_add_f32_e64 v2 /*v258*/, v0, v251 /*v507*/ + $vgpr258 = V_ADD_F32_e64 0, $vgpr0, 0, undef $vgpr507, 0, 0, implicit $exec, implicit $mode + + ; VOP3 + + ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 + ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ + $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode + + ; No change + ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ + $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode + + ; Tuple crossing the 256 boundary + ; GCN-NEXT: s_set_vgpr_msb 17 + ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 + ; GCN-NEXT: v_mqsad_u32_u8 v[254:257], v[2:3] /*v[258:259]*/, v0, v[244:247] /*v[500:503]*/ + $vgpr254_vgpr255_vgpr256_vgpr257 = V_MQSAD_U32_U8_e64 $vgpr258_vgpr259, $vgpr0, undef $vgpr500_vgpr501_vgpr502_vgpr503, 0, implicit $exec + + ; DPP/tied operand + ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=0 + ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; GCN-NEXT: v_add_nc_u16_e64_dpp v0 /*v256*/, v1 /*v257*/, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 + $vgpr256 = V_ADD_NC_U16_fake16_e64_dpp $vgpr256, 0, $vgpr257, 0, undef $vgpr258, 0, 0, 1, 15, 15, 1, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 17 + ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 + ; GCN-NEXT: v_add3_u32_e64_dpp v0, v1 /*v257*/, v0, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 + $vgpr0 = V_ADD3_U32_e64_dpp $vgpr0, $vgpr257, $vgpr0, undef $vgpr258, 1, 15, 15, 1, implicit $exec + + ; DS (addr, data0, and data1 operands) + + ; GCN-NEXT: s_set_vgpr_msb 20 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=1 src2=1 + ; GCN-NEXT: ds_store_2addr_b32 v0, v248 /*v504*/, v249 /*v505*/ offset1:1 + DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr504, undef $vgpr505, 0, 1, 0, implicit $exec + + ; Reset + ; GCN-NEXT: s_set_vgpr_msb 0 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + ; GCN-NEXT: ds_store_2addr_b32 v0, v248, v249 offset1:1 + DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr248, undef $vgpr249, 0, 1, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 1 + ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 + ; GCN-NEXT: ds_load_b32 v0, v255 /*v511*/ + $vgpr0 = DS_READ_B32_gfx9 $vgpr511, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 + ; GCN-NEXT: ds_add_rtn_u32 v255 /*v511*/, v0, v248 /*v504*/ + $vgpr511 = DS_ADD_RTN_U32_gfx9 $vgpr0, undef $vgpr504, 0, 0, implicit $exec + + ; Reset + ; GCN-NEXT: s_set_vgpr_msb 0 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + ; GCN-NEXT: ds_add_rtn_u32 v0, v0, v0 + $vgpr0 = DS_ADD_RTN_U32_gfx9 $vgpr0, 
$vgpr0, 0, 0, implicit $exec + + ; FLAT (vaddr, vdata and vdst operands) + + ; GCN-NEXT: s_set_vgpr_msb 1 + ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 + ; GCN-NEXT: global_load_b32 v2, v[2:3] /*v[258:259]*/, off + $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr258_vgpr259, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 64 + ; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0 + ; GCN-NEXT: global_load_b32 v255 /*v511*/, v0, s[0:1] + $vgpr511 = GLOBAL_LOAD_DWORD_SADDR undef $sgpr0_sgpr1, $vgpr0, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 1 + ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 + ; GCN-NEXT: scratch_load_u8 v0, v255 /*v511*/, s0 + $vgpr0 = SCRATCH_LOAD_UBYTE_SVS $vgpr511, undef $sgpr0, 0, 0, implicit $exec, implicit $flat_scr + + ; GCN-NEXT: s_set_vgpr_msb 0 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + ; GCN-NEXT: global_store_b32 v[0:1], v2, off + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 5 + ; ASM-SAME: ; msbs: dst=0 src0=1 src1=1 src2=0 + ; GCN-NEXT: global_store_b32 v[0:1] /*v[256:257]*/, v255 /*v511*/, off + GLOBAL_STORE_DWORD $vgpr256_vgpr257, $vgpr511, 0, 0, implicit $exec + + ; No change + ; GCN-NEXT: global_store_b96 v[0:1] /*v[256:257]*/, v[244:246] /*v[500:502]*/, off + GLOBAL_STORE_DWORDX3 $vgpr256_vgpr257, $vgpr500_vgpr501_vgpr502, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 + ; GCN-NEXT: flat_atomic_add_u32 v254 /*v510*/, v[0:1], v255 /*v511*/ th:TH_ATOMIC_RETURN + $vgpr510 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr511, 0, 1, implicit $exec, implicit $flat_scr + + ; Reset + ; GCN-NEXT: s_set_vgpr_msb 0 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + ; GCN-NEXT: flat_atomic_add_u32 v0, v[0:1], v255 th:TH_ATOMIC_RETURN + $vgpr0 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr255, 0, 1, implicit $exec, implicit $flat_scr + + ; VBUFFER (vdata, vaddr operands) + + ; GCN-NEXT: buffer_load_b32 v1, v0, s[8:11], s3 offen + $vgpr1 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 64 + ; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0 + ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0, s[8:11], s3 offen + $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0 + ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0 /*v256*/, s[8:11], s3 offen + $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr256, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + ; GCN-NEXT: buffer_store_b32 v0, v1, s[0:3], s3 offen + BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0 + ; GCN-NEXT: buffer_store_b32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen + BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + ; GCN-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s3 offen + BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 
src2=0 + ; GCN-NEXT: buffer_atomic_add_f32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen + BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec + + ; VGPRs above 512 + + ; GCN-NEXT: s_set_vgpr_msb 0xaa + ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=2 + ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ + $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0xab + ; ASM-SAME: ; msbs: dst=2 src0=3 src1=2 src2=2 + ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v0 /*v768*/, v2 /*v514*/, v3 /*v515*/ + $vgpr512 = V_FMA_F32_e64 0, undef $vgpr768, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0xae + ; ASM-SAME: ; msbs: dst=2 src0=2 src1=3 src2=2 + ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v770*/, v3 /*v515*/ + $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr770, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0xba + ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=3 + ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v771*/ + $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0xea + ; ASM-SAME: ; msbs: dst=3 src0=2 src1=2 src2=2 + ; GCN-NEXT: v_fma_f32 v255 /*v1023*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ + $vgpr1023 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0xff + ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=3 + ; GCN-NEXT: v_fma_f32 v0 /*v768*/, v1 /*v769*/, v2 /*v770*/, v3 /*v771*/ + $vgpr768 = V_FMA_F32_e64 0, undef $vgpr769, 0, undef $vgpr770, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0x42 + ; ASM-SAME: ; msbs: dst=1 src0=2 src1=0 src2=0 + ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 /*v512*/ + $vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec + + ; Reset + + ; GCN-NEXT: s_set_vgpr_msb 0 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + ; GCN-NEXT: v_fma_f32 v0, v1, v2, v3 + $vgpr0 = V_FMA_F32_e64 0, undef $vgpr1, 0, undef $vgpr2, 0, undef $vgpr3, 0, 0, implicit $exec, implicit $mode + + ; Tuples + + ; GCN-NEXT: s_set_vgpr_msb 10 + ; ASM-SAME: ; msbs: dst=0 src0=2 src1=2 src2=0 + ; GCN-NEXT: global_store_b96 v[0:1] /*v[512:513]*/, v[0:2] /*v[512:514]*/, off + GLOBAL_STORE_DWORDX3 $vgpr512_vgpr513, $vgpr512_vgpr513_vgpr514, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 11 + ; ASM-SAME: ; msbs: dst=0 src0=3 src1=2 src2=0 + ; GCN-NEXT: global_store_b64 v[254:255] /*v[1022:1023]*/, v[254:255] /*v[766:767]*/, off + GLOBAL_STORE_DWORDX2 $vgpr1022_vgpr1023, $vgpr766_vgpr767, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 + ; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/ + early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec + + ; ASM: NumVgprs: 1024 + +... 
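+
+# A note for readers (not matched by FileCheck): the s_set_vgpr_msb immediate
+# packs four 2-bit MSB fields, src0 in bits [1:0], src1 in [3:2], src2 in
+# [5:4], and vdst in [7:6], which is exactly what the "msbs:" comments above
+# decode.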
+
+# ASM-LABEL: {{^}}vopd:
+# DIS-LABEL: <vopd>:
+---
+name: vopd
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+
+    ; GCN-NEXT: v_dual_sub_f32 v255, v1, v1 :: v_dual_mul_f32 v6, v0, v0
+    $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr1, undef $vgpr0, undef $vgpr0, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, v1, v2 :: v_dual_mul_f32 v0 /*v256*/, v3, v4
+    $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr2, undef $vgpr3, undef $vgpr4, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x41
+    ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, s1, v2 :: v_dual_mul_f32 v0 /*v256*/, v44 /*v300*/, v4
+    $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $sgpr1, undef $vgpr2, undef $vgpr300, undef $vgpr4, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 4
+    ; GCN-NEXT: v_dual_sub_f32 v255, v1, v44 /*v300*/ :: v_dual_mul_f32 v6, v0, v1 /*v257*/
+    $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr300, undef $vgpr0, $vgpr257, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: v_dual_sub_f32 v255, 0, v1 :: v_dual_mul_f32 v6, v44 /*v300*/, v3
+    $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 0, undef $vgpr1, undef $vgpr300, undef $vgpr3, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_dual_fmamk_f32 v243 /*v499*/, v0, 0xa, v3 :: v_dual_fmac_f32 v0 /*v256*/, v1, v1
+    $vgpr499, $vgpr256 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr3, undef $vgpr1, undef $vgpr1, $vgpr256, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: v_dual_mov_b32 v2, v3 /*v259*/ :: v_dual_add_f32 v3, v1 /*v257*/, v2 /*v258*/
+    $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1250 undef $vgpr259, undef $vgpr257, undef $vgpr258, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x44
+    ; GCN-NEXT: v_dual_fmamk_f32 v244 /*v500*/, v0, 0xa, v44 /*v300*/ :: v_dual_fmac_f32 v3 /*v259*/, v1, v1 /*v257*/
+    $vgpr500, $vgpr259 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr300, undef $vgpr1, undef $vgpr257, $vgpr259, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 16
+    ; GCN-NEXT: v_dual_fma_f32 v0, v6, v6, v44 /*v300*/ :: v_dual_fma_f32 v1, v4, v5, v45 /*v301*/
+    $vgpr0, $vgpr1 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, 0, undef $vgpr300, 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $vgpr301, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: v_dual_fmac_f32 v2, v6, v6 :: v_dual_fma_f32 v3, v4, v5, v3
+    $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, undef $vgpr2, 0, undef $vgpr4, 0, undef $vgpr5, 0, $vgpr3, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_dual_fma_f32 v244 /*v500*/, v6, v7, v8 :: v_dual_add_f32 v3 /*v259*/, v4, v5
+    $vgpr500, $vgpr259 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr7, 0, undef $vgpr8, 0, undef $vgpr4, 0, undef $vgpr5, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 0xae
+    ; GCN-NEXT: v_dual_fmac_f32 v2 /*v514*/, v6 /*v518*/, v8 /*v776*/ :: v_dual_fma_f32 v3 /*v515*/, v4 /*v516*/, v7 /*v775*/, v3 /*v515*/
+    $vgpr514, $vgpr515 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr518, 0, undef $vgpr776, undef $vgpr514, 0, undef 
$vgpr516, 0, undef $vgpr775, 0, $vgpr515, implicit $mode, implicit $exec
+
+    ; ASM: NumVgprs: 777
+
+...
+
+# ASM-LABEL: {{^}}fmaak_fmamk:
+# DIS-LABEL: <fmaak_fmamk>:
+---
+name: fmaak_fmamk
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x45
+    ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2 /*v258*/, 0x1
+    $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1
+    $vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x41
+    ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1
+    $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x44
+    ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1, v2 /*v258*/, 0x1
+    $vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x45
+    ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/
+    $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: v_fmamk_f32 v0, v1 /*v257*/, 0x1, v2 /*v258*/
+    $vgpr0 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x41
+    ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2
+    $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr2, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x44
+    ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1, 0x1, v2 /*v258*/
+    $vgpr256 = V_FMAMK_F32 undef $vgpr1, 1, undef $vgpr258, implicit $exec, implicit $mode
+
+    ; ASM: NumVgprs: 259
+
+...
+
+# ASM-LABEL: {{^}}fmac:
+# DIS-LABEL: <fmac>:
+---
+name: fmac
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+
+    ; Accumulation instructions apply DST to both the destination and one of the source VGPRs
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_fmac_f32_e64 v0 /*v256*/, |v0|, |v1| clamp mul:4
+    $vgpr256 = V_FMAC_F32_e64 2, undef $vgpr0, 2, undef $vgpr1, 2, undef $vgpr256, 1, 2, implicit $mode, implicit $exec
+
+    ; GCN-NEXT: v_fmac_f32_e32 v1 /*v257*/, v0, v1
+    $vgpr257 = V_FMAC_F32_e32 undef $vgpr0, undef $vgpr1, undef $vgpr257, implicit $mode, implicit $exec
+
+    ; ASM: NumVgprs: 258
+
+...
+
+# ASM-LABEL: {{^}}rev_opcodes:
+# DIS-LABEL: <rev_opcodes>:
+---
+name: rev_opcodes
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+
+    ; V_LSHLREV, V_SUBREV: SRC0 and SRC1 apply to the operands in the order in the ISA (before "reversing")
+    ; e.g. v_lshlrev_b32 v0(vdst), v1(src0), v2(src1) // v0 = v2 << v1
+    ; DST applies to V0, SRC0 applies to V1, and SRC1 applies to V2.
+
+    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: v_lshlrev_b32_e64 v0, v0 /*v256*/, v2
+    $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr256, undef $vgpr2, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 4
+    ; GCN-NEXT: v_lshlrev_b32_e64 v0, v1, v0 /*v256*/
+    $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr1, undef $vgpr256, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2
+    $vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 4
+    ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v1, v0 /*v256*/
+    $vgpr0 = V_SUBREV_U32_e32 undef $vgpr1, undef $vgpr256, implicit $exec
+
+    ; ASM: NumVgprs: 257
+...
+
+# ASM-LABEL: {{^}}minimal_mode_change:
+# DIS-LABEL: <minimal_mode_change>:
+---
+name: minimal_mode_change
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+
+    ; GCN-NEXT: s_set_vgpr_msb 0x55
+    ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
+    $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v2
+    $vgpr0 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
+
+    ; GCN-NEXT: v_mov_b32_e32 v0, v1
+    $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v3 /*v259*/, v1
+    $vgpr259 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+    ; GCN-NEXT: v_add_nc_u32_e32 v0 /*v256*/, v1, v2
+    $vgpr256 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: v_fma_f32 v3, v4, v5, s2
+    $vgpr3 = V_FMA_F32_e64 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $sgpr2, 0, 0, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: v_fma_f32 v3, v4 /*v260*/, v5, 1
+    $vgpr3 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr5, 0, 1, 0, 0, implicit $exec, implicit $mode
+
+    ; GCN-NEXT: s_set_vgpr_msb 4
+    ; GCN-NEXT: v_mov_b32_e32 v0, v1
+    $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+    ; GCN-NEXT: v_add_nc_u32_e32 v2, v1, v3 /*v259*/
+    $vgpr2 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr259, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: v_mov_b32_e32 v0, v0 /*v256*/
+    ; GCN-NEXT: v_add_nc_u32_e32 v1, v1 /*v257*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: v_add_nc_u32_e32 v2, v2 /*v258*/, v2 /*v258*/
+    $vgpr0 = V_MOV_B32_e32 undef $vgpr256, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 undef $vgpr257, undef $vgpr1, implicit $exec
+    $vgpr2 = V_ADD_U32_e32 undef $vgpr258, undef $vgpr258, implicit $exec
+
+    ; ASM: NumVgprs: 263
+
+...
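+
+# The minimal_mode_change sequences above exercise setMode() folding: when no
+# instruction since the last s_set_vgpr_msb cared about the fields being
+# changed, the pass rewrites that instruction's immediate instead of emitting
+# a second mode switch.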
+
+# ASM-LABEL: {{^}}terminators:
+# DIS-LABEL: <terminators>:
+---
+name: terminators
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+    ; GCN-NEXT: s_nop 0
+    ; GCN-NEXT: s_branch
+    S_NOP 0
+    S_BRANCH %bb.1
+
+    ; No mode switch if it was zero
+
+  bb.1:
+    ; ASM: .LBB{{.*_1}}:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+    ; No mode switch on fall through
+
+  bb.2:
+    ; ASM-NEXT: %bb.2:
+    ; GCN-NEXT: s_nop 0
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_branch
+    S_NOP 0
+    S_BRANCH %bb.3
+
+    ; Reset mode on terminator
+
+  bb.3:
+    ; ASM: .LBB{{.*_3}}:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_swap_pc_i64
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    $exec = S_SWAPPC_B64 undef $sgpr0_sgpr1
+
+    ; Reset mode before a call
+
+  bb.4:
+    ; ASM-NEXT: %bb.4:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: s_endpgm
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    S_ENDPGM 0
+
+    ; No mode reset before S_ENDPGM
+
+  bb.5:
+    ; ASM-NEXT: %bb.5:
+    ; GCN-NEXT: v_mov_b32_e32 v0, v1
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_pc_i64
+    $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    S_SETPC_B64 undef $sgpr0_sgpr1, implicit-def $exec
+
+    ; Assume mode zero at block begin even if we did not reset it before
+    ; Reset mode before branch
+
+  bb.6:
+    ; ASM-NEXT: %bb.6:
+    ; GCN-NEXT: s_set_pc_i64
+    S_SETPC_B64 undef $sgpr0_sgpr1, implicit-def $exec
+
+    ; But do not reset mode before a branch if it was zero
+
+  bb.7:
+    ; ASM-NEXT: %bb.7:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; ASM-NEXT: ; return to shader part epilog
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    SI_RETURN_TO_EPILOG undef $vgpr0, implicit-def $exec
+
+    ; Reset mode before returning from a call
+
+  bb.8:
+    ; ASM-NEXT: %bb.8:
+    ; ASM-NEXT: ; return to shader part epilog
+    SI_RETURN_TO_EPILOG undef $vgpr0, implicit-def $exec
+
+    ; But do not reset mode before a call return if it was zero
+
+  bb.9:
+    ; ASM-NEXT: %bb.9:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_set_pc_i64
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    S_SETPC_B64_return undef $sgpr0_sgpr1, implicit-def $exec
+
+    ; ASM: NumVgprs: 257
+...
+
+# ASM-LABEL: {{^}}control_flow:
+# DIS-LABEL: <control_flow>:
+---
+name: control_flow
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec
+
+  bb.1:
+    ; ASM: .LBB{{[0-9]+}}_1:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: s_cbranch_scc0
+    $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    S_CBRANCH_SCC0 %bb.1, undef implicit $scc
+
+  bb.2:
+    ; ASM: %bb.2:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v2 /*v258*/, v2
+    ; GCN-NEXT: s_endpgm
+    $vgpr258 = V_MOV_B32_e32 undef $vgpr2, implicit $exec
+    S_ENDPGM 0
+...
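+
+# In control_flow above, bb.1 is reached both by the fall-through from bb.0
+# and by its own backedge, so on entry the MSB state is not known and the pass
+# conservatively re-emits s_set_vgpr_msb for v257 and resets it before the
+# conditional branch.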
+
+# ASM-LABEL: {{^}}inline_asm:
+# DIS-LABEL: <inline_asm>:
+---
+name: inline_asm
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; ASM: def v0
+    ; GCN-NOT: s_set_vgpr_msb
+    ; ASM: use v0
+    ; GCN-NOT: s_set_vgpr_msb
+    ; ASM: use v1
+    ; GCN: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NOT: s_set_vgpr_msb
+    ; ASM: no vgprs, mode preserved
+    ; GCN-NOT: s_set_vgpr_msb
+    ; GCN: v_mov_b32_e32 v0 /*v256*/, v1
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    INLINEASM &"; def v0", 1, 327690, def $vgpr0
+    INLINEASM &"; use v0", 1, 327690, $vgpr0
+    INLINEASM &"; use v1", 1, 327690, undef $vgpr1
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    INLINEASM &"; no vgprs, mode preserved", 1, 327690, undef $sgpr0
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+
+    ; ASM: NumVgprs: 257
+...
+
+# ASM-LABEL: {{^}}bundle:
+# DIS-LABEL: <bundle>:
+---
+name: bundle
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: s_nop 0
+    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: v_mov_b32_e32 v1, v0 /*v256*/
+    BUNDLE implicit-def $vgpr256 {
+      $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    }
+    BUNDLE implicit $vgpr256 {
+      S_NOP 0
+      $vgpr1 = V_MOV_B32_e32 $vgpr256, implicit $exec
+    }
+
+    ; ASM: NumVgprs: 257
+...
+
+# ASM-LABEL: {{^}}hard_clauses:
+# DIS-LABEL: <hard_clauses>:
+---
+name: hard_clauses
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+
+    ; s_set_vgpr_msb cannot be the first instruction in a clause and must be placed before it.
+
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: s_clause 0x2
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
+    ; GCN-NEXT: v_mov_b32_e32 v2 /*v258*/, v1
+    BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr248, implicit undef $vgpr1 {
+      S_CLAUSE 2
+      $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+      $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+      $vgpr258 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    }
+
+    ; S_CLAUSE 515 means 4 instructions broken in groups of 2.
+    ; A mode change cannot be the first instruction of each group.
+    ; If we cannot insert a mode change right before the clause, just drop it.
+
+    ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+    ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: v_mov_b32_e32 v2, v1
+    ; GCN-NEXT: v_mov_b32_e32 v3, v1
+    BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr1 {
+      S_CLAUSE 515
+      $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+      $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+      $vgpr2 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+      $vgpr3 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    }
+
+    ; Check that we properly update the clause length.
+
+    ; GCN-NEXT: s_clause 0x3
+    ; GCN-NEXT: v_mov_b32_e32 v0, v1
+    ; GCN-NEXT: s_set_vgpr_msb 64
+    ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
+    ; GCN-NEXT: v_mov_b32_e32 v2 /*v258*/, v1
+    BUNDLE implicit-def $vgpr0, implicit-def $vgpr257, implicit-def $vgpr248, implicit undef $vgpr1 {
+      S_CLAUSE 2
+      $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+      $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+      $vgpr258 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    }
+
+    ; Check that we do not exceed the limit of 63 instructions or the simm16 value of 62.
+ + ; GCN-NEXT: s_clause 0x3e + ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 + ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: v_mov_b32_e32 v1, v1 + ; GCN-NEXT: v_mov_b32_e32 v2, v1 + ; GCN-COUNT-60: v_mov_b32_e32 v1, v1 + BUNDLE implicit-def $vgpr256, implicit-def $vgpr1, implicit-def $vgpr2, implicit undef $vgpr1 { + S_CLAUSE 62 + $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr2 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec + $vgpr1 = V_MOV_B32_e32 undef $vgpr1, 
implicit $exec
+      $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+      $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+      $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+    }
+
+    ; ASM: NumVgprs: 259
+...
+
+# ASM-LABEL: {{^}}pseudo:
+# DIS-LABEL: <pseudo>:
+---
+name: pseudo
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    $sgpr0 = SI_ILLEGAL_COPY killed $vgpr0, implicit-def $exec, implicit-def $vcc, implicit $exec
+    ; Just do not assert here.
+    ; ASM: illegal copy v0 to s0
+    SI_RETURN_TO_EPILOG killed $sgpr0
+    S_ENDPGM 0
+...
+
+# LD_SCALE operands ignore the MSB and always use the low 256 VGPRs.
+
+# ASM-LABEL: {{^}}ld_scale:
+# DIS-LABEL: <ld_scale>:
+---
+name: ld_scale
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; ASM: %bb.0:
+
+    ; GCN: s_set_vgpr_msb 5
+    ; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/
+    $vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec
+
+    ; Do not change mode for LD_SCALE.
+
+    ; GCN-NOT: s_set_vgpr_msb
+    ; GCN-NEXT: v_wmma_ld_scale_paired_b32 v1, v2
+    V_WMMA_LD_SCALE_PAIRED_B32 undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
+
+    ; GCN-NOT: s_set_vgpr_msb
+    ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v1, v2
+    $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 0
+    ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v1, v2
+    $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 1
+    ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[0:15], v[10:17], v1, v2
+    $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+
+    ; GCN-NOT: s_set_vgpr_msb
+    ; GCN-NEXT: v_wmma_ld_scale16_paired_b64 v[0:1], v[2:3]
+    V_WMMA_LD_SCALE16_PAIRED_B64 undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+
+    ; GCN-NEXT: s_set_vgpr_msb 5
+    ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v[0:1], v[2:3]
+    $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = 
V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v[0:1], v[2:3] + $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + + ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[0:15], v[10:17], v[0:1], v[2:3] + $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +... diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 987fb042cd089..2208ae5622386 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -164,6 +164,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPULowerKernelArguments.cpp", "AMDGPULowerKernelAttributes.cpp", "AMDGPULowerModuleLDSPass.cpp", + "AMDGPULowerVGPREncoding.cpp", "AMDGPUMCInstLower.cpp", "AMDGPUMCResourceInfo.cpp", "AMDGPUMIRFormatter.cpp",