diff --git a/llvm/include/llvm/CodeGen/LiveStacks.h b/llvm/include/llvm/CodeGen/LiveStacks.h index 2edc2985f0ee6..b9e5598738390 100644 --- a/llvm/include/llvm/CodeGen/LiveStacks.h +++ b/llvm/include/llvm/CodeGen/LiveStacks.h @@ -34,6 +34,7 @@ class TargetRegisterInfo; class LiveStacks : public MachineFunctionPass { const TargetRegisterInfo *TRI = nullptr; + const MachineRegisterInfo *MRI = nullptr; /// Special pool allocator for VNInfo's (LiveInterval val#). /// diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index fcdd73d8b65fd..c7067d90a3ce0 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -1586,7 +1586,7 @@ class MachineInstr const TargetRegisterClass *getRegClassConstraintEffectForVReg( Register Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, - bool ExploreBundle = false) const; + const MachineRegisterInfo &MRI, bool ExploreBundle = false) const; /// Applies the constraints (def/use) implied by the \p OpIdx operand /// to the given \p CurRC. @@ -1600,7 +1600,8 @@ class MachineInstr const TargetRegisterClass * getRegClassConstraintEffect(unsigned OpIdx, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const; /// Add a tie between the register operands at DefIdx and UseIdx. /// The tie will cause the register allocator to ensure that the two @@ -2005,7 +2006,8 @@ class MachineInstr /// If the related operand does not constrained Reg, this returns CurRC. 
const TargetRegisterClass *getRegClassConstraintEffectForVRegImpl( unsigned OpIdx, Register Reg, const TargetRegisterClass *CurRC, - const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const; + const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const; /// Stores extra instruction information inline or allocates as ExtraInfo /// based on the number of pointers. diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 09d9a0b4ec402..78414f31d1bf8 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -94,6 +94,16 @@ class MachineRegisterInfo { /// all registers that were disabled are removed from the list. SmallVector UpdatedCSRs; + /// The Synthetic field of regclasses. Targets can alter this vector + /// to enable classes dynamically during codegen. + BitVector RegClassSyntheticInfo; + + /// To have more restrictions on the allocation order for each register class. + /// For some targets, two distinct register classes can have the same set of + /// registers, but their allocatable set can vary. This vector helps targets + /// to dynamically determine the actual allocatable set for RCs. + std::vector RegClassAllocationMasks; + /// RegAllocHints - This vector records register allocation hints for /// virtual registers. For each virtual register, it keeps a pair of hint /// type and hints vector making up the allocation hints. Only the first @@ -257,6 +267,30 @@ class MachineRegisterInfo { /// Notice that it will override ant previously disabled/saved CSRs. void setCalleeSavedRegs(ArrayRef CSRs); + /// Initialize the RegClassSyntheticInfo. It sets the bit positions + /// exactly as the tablegened Synthetic field. Targets can later flip this + /// field to enable/disable the regclass whenever required. 
+ void initializeRegClassSyntheticInfo(); + + /// Change the synthetic info for the regclass \p RC to \p Value. + void changeSyntheticInfoForRC(const TargetRegisterClass *RC, bool Value); + + /// This function checks if \p RC is enabled or not so that it can be included + /// in various regclass related queries. + bool isEnabled(const TargetRegisterClass *RC) const; + + /// Update dynamically determined allocation mask for register classes. + void updateAllocationMaskForRCs(std::vector &&Mask); + + /// Return the allocation mask for regclass \p RC. + const BitVector &getAllocationMaskForRC(const TargetRegisterClass &RC) const; + + /// True when the target has dynamically determined allocation mask for its + /// register classes. + bool hasAllocationMaskForRCs() const { + return RegClassAllocationMasks.size(); + } + + // Strictly for use by MachineInstr.cpp. void addRegOperandToUseList(MachineOperand *MO); diff --git a/llvm/include/llvm/CodeGen/RegisterBankInfo.h b/llvm/include/llvm/CodeGen/RegisterBankInfo.h index 62c4a57a605d6..117f381512a1c 100644 --- a/llvm/include/llvm/CodeGen/RegisterBankInfo.h +++ b/llvm/include/llvm/CodeGen/RegisterBankInfo.h @@ -445,7 +445,8 @@ class RegisterBankInfo { /// Get the MinimalPhysRegClass for Reg. /// \pre Reg is a physical register. const TargetRegisterClass * - getMinimalPhysRegClass(Register Reg, const TargetRegisterInfo &TRI) const; + getMinimalPhysRegClass(Register Reg, const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI) const; /// Try to get the mapping of \p MI. /// See getInstrMapping for more details on what a mapping represents. 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 59fad88f91b1d..5ef963cae5528 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4384,6 +4384,7 @@ class TargetLowering : public TargetLoweringBase { /// Returns 'true' is the edge is necessary, 'false' otherwise virtual bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, + const MachineRegisterInfo &MRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const { return false; diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 117d3f7182974..a3d68d6743ede 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -121,6 +121,9 @@ class TargetRegisterClass { /// Return true if this register class has a defined BaseClassOrder. bool isBaseClass() const { return MC->isBaseClass(); } + /// Return true if this register class is marked synthetic. + bool isSynthetic() const { return MC->isSynthetic(); } + /// Return true if the specified TargetRegisterClass /// is a proper sub-class of this TargetRegisterClass. bool hasSubClass(const TargetRegisterClass *RC) const { @@ -338,20 +341,23 @@ class TargetRegisterInfo : public MCRegisterInfo { /// Returns the Register Class of a physical register of the given type, /// picking the most sub register class of the right type that contains this /// physreg. - const TargetRegisterClass *getMinimalPhysRegClass(MCRegister Reg, - MVT VT = MVT::Other) const; + const TargetRegisterClass * + getMinimalPhysRegClass(MCRegister Reg, const MachineRegisterInfo &MRI, + MVT VT = MVT::Other) const; /// Returns the Register Class of a physical register of the given type, /// picking the most sub register class of the right type that contains this /// physreg. 
If there is no register class compatible with the given type, /// returns nullptr. - const TargetRegisterClass *getMinimalPhysRegClassLLT(MCRegister Reg, - LLT Ty = LLT()) const; + const TargetRegisterClass * + getMinimalPhysRegClassLLT(MCRegister Reg, const MachineRegisterInfo &MRI, + LLT Ty = LLT()) const; /// Return the maximal subclass of the given register class that is /// allocatable or NULL. const TargetRegisterClass * - getAllocatableClass(const TargetRegisterClass *RC) const; + getAllocatableClass(const TargetRegisterClass *RC, + const MachineRegisterInfo &MRI) const; /// Returns a bitset indexed by register number indicating if a register is /// allocatable or not. If a register class is specified, returns the subset @@ -630,7 +636,8 @@ class TargetRegisterInfo : public MCRegisterInfo { /// TableGen will synthesize missing A sub-classes. virtual const TargetRegisterClass * getMatchingSuperRegClass(const TargetRegisterClass *A, - const TargetRegisterClass *B, unsigned Idx) const; + const TargetRegisterClass *B, unsigned Idx, + const MachineRegisterInfo &MRI) const; // For a copy-like instruction that defines a register of class DefRC with // subreg index DefSubReg, reading from another source with class SrcRC and @@ -639,7 +646,8 @@ class TargetRegisterInfo : public MCRegisterInfo { virtual bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, unsigned DefSubReg, const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const; + unsigned SrcSubReg, + const MachineRegisterInfo &MRI) const; /// Returns the largest legal sub-class of RC that /// supports the sub-register index Idx. @@ -769,10 +777,11 @@ class TargetRegisterInfo : public MCRegisterInfo { /// corresponding argument register class. /// /// The function returns NULL if no register class can be found. 
- const TargetRegisterClass* + const TargetRegisterClass * getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, const TargetRegisterClass *RCB, unsigned SubB, - unsigned &PreA, unsigned &PreB) const; + unsigned &PreA, unsigned &PreB, + const MachineRegisterInfo &MRI) const; //===--------------------------------------------------------------------===// // Register Class Information @@ -809,8 +818,8 @@ class TargetRegisterInfo : public MCRegisterInfo { /// Find the largest common subclass of A and B. /// Return NULL if there is no common subclass. const TargetRegisterClass * - getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B) const; + getCommonSubClass(const TargetRegisterClass *A, const TargetRegisterClass *B, + const MachineRegisterInfo &MRI) const; /// Returns a TargetRegisterClass used for pointer values. /// If a target supports multiple different pointer register classes, diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index fb4d11ec1d4d1..1a2ff58c31f5a 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -47,6 +47,7 @@ class MCRegisterClass { const int8_t CopyCost; const bool Allocatable; const bool BaseClass; + const bool Synthetic; /// getID() - Return the register class ID number. /// @@ -101,6 +102,9 @@ class MCRegisterClass { /// Return true if this register class has a defined BaseClassOrder. bool isBaseClass() const { return BaseClass; } + /// isSynthetic - Return true if this is a synthetic class. This field helps + /// targets to dynamically enable the regclass during codegen. 
+ bool isSynthetic() const { return Synthetic; } }; /// MCRegisterDesc - This record contains information about a particular diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index cb1c0ed2513d4..0cf9159030bc5 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -338,6 +338,12 @@ class RegisterClass regTypes, int alignment, // Target-specific flags. This becomes the TSFlags field in TargetRegisterClass. bits<8> TSFlags = 0; + // If set to true, the register class won't take part in various regclass queries + // by default. This allows targets to dynamically enable classes for a period + // during codegen so that they can be turned allocatable, copyable, spillable, + // and/or made available for various regclass queries. + bit Synthetic = false; + // If set then consider this register class to be the base class for registers in // its MemberList. The base class for registers present in multiple base register // classes will be resolved in the order defined by this value, with lower values diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index ed6ce6bc73d38..93860aae8acfd 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -604,7 +604,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( // check every use of the register and find the largest register class // that can be used in all of them. 
const TargetRegisterClass *SuperRC = - TRI->getMinimalPhysRegClass(SuperReg, MVT::Other); + TRI->getMinimalPhysRegClass(SuperReg, MRI, MVT::Other); ArrayRef Order = RegClassInfo.getOrder(SuperRC); if (Order.empty()) { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 14f2a363f9be6..7b403fee7ed12 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -813,7 +813,7 @@ void DwarfCompileUnit::applyConcreteDbgVariableAttributes( auto AddEntry = [&](const DbgValueLocEntry &Entry, DIExpressionCursor &Cursor) { if (Entry.isLocation()) { - if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, + if (!DwarfExpr.addMachineRegExpression(TRI, Asm->MF->getRegInfo(), Cursor, Entry.getLoc().getReg())) return false; } else if (Entry.isInt()) { @@ -910,7 +910,8 @@ void DwarfCompileUnit::applyConcreteDbgVariableAttributes(const Loc::MMI &MMI, addOpAddress(*Loc, FrameSymbol); else DwarfExpr.addMachineRegExpression( - *Asm->MF->getSubtarget().getRegisterInfo(), Cursor, FrameReg); + *Asm->MF->getSubtarget().getRegisterInfo(), Asm->MF->getRegInfo(), + Cursor, FrameReg); DwarfExpr.addExpression(std::move(Cursor)); } if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) { @@ -940,7 +941,8 @@ void DwarfCompileUnit::applyConcreteDbgVariableAttributes( DIExpressionCursor Cursor(Expr.getElements()); DwarfExpr.beginEntryValueExpression(Cursor); DwarfExpr.addMachineRegExpression( - *Asm->MF->getSubtarget().getRegisterInfo(), Cursor, Register); + *Asm->MF->getSubtarget().getRegisterInfo(), Asm->MF->getRegInfo(), + Cursor, Register); DwarfExpr.addExpression(std::move(Cursor)); } addBlock(VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); @@ -1557,7 +1559,8 @@ void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute, DIExpressionCursor Cursor({}); const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); - if 
(!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) + if (!DwarfExpr.addMachineRegExpression(TRI, Asm->MF->getRegInfo(), Cursor, + Location.getReg())) return; DwarfExpr.addExpression(std::move(Cursor)); @@ -1587,7 +1590,8 @@ void DwarfCompileUnit::addComplexAddress(const DIExpression *DIExpr, DIE &Die, DwarfExpr.beginEntryValueExpression(Cursor); const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); - if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) + if (!DwarfExpr.addMachineRegExpression(TRI, Asm->MF->getRegInfo(), Cursor, + Location.getReg())) return; DwarfExpr.addExpression(std::move(Cursor)); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 6b5ad62e083e3..0b372d7b0e1bf 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2652,7 +2652,8 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, DwarfExpr.beginEntryValueExpression(ExprCursor); const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo(); - if (!DwarfExpr.addMachineRegExpression(TRI, ExprCursor, Location.getReg())) + if (!DwarfExpr.addMachineRegExpression(TRI, AP.MF->getRegInfo(), ExprCursor, + Location.getReg())) return; return DwarfExpr.addExpression(std::move(ExprCursor)); } @@ -2673,7 +2674,8 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, DwarfExpr.setMemoryLocationKind(); const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo(); - if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) + if (!DwarfExpr.addMachineRegExpression(TRI, AP.MF->getRegInfo(), Cursor, + Location.getReg())) return false; } else if (Entry.isTargetIndexLocation()) { TargetIndexLocation Loc = Entry.getTargetIndexLocation(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 
a74d43897d45b..6d4794d27effb 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -97,6 +97,7 @@ void DwarfExpression::addAnd(unsigned Mask) { } bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, llvm::Register MachineReg, unsigned MaxSize) { if (!MachineReg.isPhysical()) { @@ -134,7 +135,7 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, // For example, Q0 on ARM is a composition of D0+D1. unsigned CurPos = 0; // The size of the register in bits. - const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(MachineReg); + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(MachineReg, MRI); unsigned RegSize = TRI.getRegSizeInBits(*RC); // Keep track of the bits in the register we already emitted, so we // can avoid emitting redundant aliasing subregs. Because this is @@ -248,11 +249,13 @@ void DwarfExpression::addConstantFP(const APFloat &APF, const AsmPrinter &AP) { } bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, DIExpressionCursor &ExprCursor, llvm::Register MachineReg, unsigned FragmentOffsetInBits) { auto Fragment = ExprCursor.getFragmentInfo(); - if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U)) { + if (!addMachineReg(TRI, MRI, MachineReg, + Fragment ? Fragment->SizeInBits : ~1U)) { LocationKind = Unknown; return false; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 667a9efc6f6c0..70ab73b5996b4 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -245,7 +245,8 @@ class DwarfExpression { /// multiple subregisters that alias the register. /// /// \return false if no DWARF register exists for MachineReg. 
- bool addMachineReg(const TargetRegisterInfo &TRI, llvm::Register MachineReg, + bool addMachineReg(const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, llvm::Register MachineReg, unsigned MaxSize = ~1U); /// Emit a DW_OP_piece or DW_OP_bit_piece operation for a variable fragment. @@ -325,6 +326,7 @@ class DwarfExpression { /// \return false if no DWARF register exists /// for MachineReg. bool addMachineRegExpression(const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, DIExpressionCursor &Expr, llvm::Register MachineReg, unsigned FragmentOffsetInBits = 0); diff --git a/llvm/lib/CodeGen/DetectDeadLanes.cpp b/llvm/lib/CodeGen/DetectDeadLanes.cpp index 86e9f3abe010d..57c5466129162 100644 --- a/llvm/lib/CodeGen/DetectDeadLanes.cpp +++ b/llvm/lib/CodeGen/DetectDeadLanes.cpp @@ -97,12 +97,12 @@ static bool isCrossCopy(const MachineRegisterInfo &MRI, unsigned PreA, PreB; // Unused. if (SrcSubIdx && DstSubIdx) return !TRI.getCommonSuperRegClass(SrcRC, SrcSubIdx, DstRC, DstSubIdx, PreA, - PreB); + PreB, MRI); if (SrcSubIdx) - return !TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSubIdx); + return !TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSubIdx, MRI); if (DstSubIdx) - return !TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSubIdx); - return !TRI.getCommonSubClass(SrcRC, DstRC); + return !TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSubIdx, MRI); + return !TRI.getCommonSubClass(SrcRC, DstRC, MRI); } void DeadLaneDetector::addUsedLanesOnOperand(const MachineOperand &MO, diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp index 4d668c53f7156..6732a814f5440 100644 --- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp +++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp @@ -89,8 +89,9 @@ INITIALIZE_PASS_END(FixupStatepointCallerSaved, DEBUG_TYPE, "Fixup Statepoint Caller Saved", false, false) // Utility function to get size of the register. 
-static unsigned getRegisterSize(const TargetRegisterInfo &TRI, Register Reg) { - const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); +static unsigned getRegisterSize(const TargetRegisterInfo &TRI, Register Reg, + const MachineRegisterInfo &MRI) { + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg, MRI); return TRI.getSpillSize(*RC); } @@ -124,6 +125,7 @@ static Register performCopyPropagation(Register Reg, MachineBasicBlock *MBB = RI->getParent(); MachineBasicBlock::reverse_iterator E = MBB->rend(); MachineInstr *Def = nullptr, *Use = nullptr; + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); for (auto It = ++(RI.getReverse()); It != E; ++It) { if (It->readsRegister(Reg, &TRI) && !Use) Use = &*It; @@ -142,7 +144,7 @@ static Register performCopyPropagation(Register Reg, Register SrcReg = DestSrc->Source->getReg(); - if (getRegisterSize(TRI, Reg) != getRegisterSize(TRI, SrcReg)) + if (getRegisterSize(TRI, Reg, MRI) != getRegisterSize(TRI, SrcReg, MRI)) return Reg; LLVM_DEBUG(dbgs() << "spillRegisters: perform copy propagation " @@ -209,6 +211,7 @@ class FrameIndexesCache { }; MachineFrameInfo &MFI; const TargetRegisterInfo &TRI; + const MachineRegisterInfo &MRI; // Map size to list of frame indexes of this size. If the mode is // FixupSCSExtendSlotSize then the key 0 is used to keep all frame indexes. // If the size of required spill slot is greater than in a cache then the @@ -232,8 +235,9 @@ class FrameIndexesCache { } public: - FrameIndexesCache(MachineFrameInfo &MFI, const TargetRegisterInfo &TRI) - : MFI(MFI), TRI(TRI) {} + FrameIndexesCache(MachineFrameInfo &MFI, const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI) + : MFI(MFI), TRI(TRI), MRI(MRI) {} // Reset the current state of used frame indexes. After invocation of // this function all frame indexes are available for allocation with // the exception of slots reserved for landing pad processing (if any). 
@@ -265,7 +269,7 @@ class FrameIndexesCache { } } - unsigned Size = getRegisterSize(TRI, Reg); + unsigned Size = getRegisterSize(TRI, Reg, MRI); FrameIndexesPerSize &Line = getCacheBucket(Size); while (Line.Index < Line.Slots.size()) { int FI = Line.Slots[Line.Index++]; @@ -299,11 +303,12 @@ class FrameIndexesCache { // Sort all registers to spill in descendent order. In the // FixupSCSExtendSlotSize mode it will minimize the total frame size. // In non FixupSCSExtendSlotSize mode we can skip this step. - void sortRegisters(SmallVectorImpl &Regs) { + void sortRegisters(SmallVectorImpl &Regs, + const MachineRegisterInfo &MRI) { if (!FixupSCSExtendSlotSize) return; llvm::sort(Regs, [&](Register &A, Register &B) { - return getRegisterSize(TRI, A) > getRegisterSize(TRI, B); + return getRegisterSize(TRI, A, MRI) > getRegisterSize(TRI, B, MRI); }); } }; @@ -398,7 +403,7 @@ class StatepointState { RegsToSpill.push_back(Reg); OpsToSpill.push_back(Idx); } - CacheFI.sortRegisters(RegsToSpill); + CacheFI.sortRegisters(RegsToSpill, MF.getRegInfo()); return !RegsToSpill.empty(); } @@ -418,7 +423,8 @@ class StatepointState { bool IsKill = true; MachineBasicBlock::iterator InsertBefore(MI); Reg = performCopyPropagation(Reg, InsertBefore, IsKill, TII, TRI); - const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + TRI.getMinimalPhysRegClass(Reg, MF.getRegInfo()); LLVM_DEBUG(dbgs() << "Insert spill before " << *InsertBefore); TII.storeRegToStackSlot(*MI.getParent(), InsertBefore, Reg, IsKill, FI, @@ -428,7 +434,8 @@ class StatepointState { void insertReloadBefore(unsigned Reg, MachineBasicBlock::iterator It, MachineBasicBlock *MBB) { - const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + TRI.getMinimalPhysRegClass(Reg, MF.getRegInfo()); int FI = RegToSlotIdx[Reg]; if (It != MBB->end()) { TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register()); @@ -519,7 +526,7 @@ class 
StatepointState { if (I == OpsToSpill[CurOpIdx]) { int FI = RegToSlotIdx[MO.getReg()]; MIB.addImm(StackMaps::IndirectMemRefOp); - MIB.addImm(getRegisterSize(TRI, MO.getReg())); + MIB.addImm(getRegisterSize(TRI, MO.getReg(), MF.getRegInfo())); assert(MO.isReg() && "Should be register"); assert(MO.getReg().isPhysical() && "Should be physical register"); MIB.addFrameIndex(FI); @@ -538,6 +545,7 @@ class StatepointState { assert(CurOpIdx == (OpsToSpill.size() - 1) && "Not all operands processed"); // Add mem operands. NewMI->setMemRefs(MF, MI.memoperands()); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (auto It : RegToSlotIdx) { Register R = It.first; int FrameIndex = It.second; @@ -546,7 +554,7 @@ class StatepointState { if (is_contained(RegsToReload, R)) Flags |= MachineMemOperand::MOStore; auto *MMO = - MF.getMachineMemOperand(PtrInfo, Flags, getRegisterSize(TRI, R), + MF.getMachineMemOperand(PtrInfo, Flags, getRegisterSize(TRI, R, MRI), MFI.getObjectAlign(FrameIndex)); NewMI->addMemOperand(MF, MMO); } @@ -570,7 +578,7 @@ class StatepointProcessor { public: StatepointProcessor(MachineFunction &MF) : MF(MF), TRI(*MF.getSubtarget().getRegisterInfo()), - CacheFI(MF.getFrameInfo(), TRI) {} + CacheFI(MF.getFrameInfo(), TRI, MF.getRegInfo()) {} bool process(MachineInstr &MI, bool AllowGCPtrInCSR) { StatepointOpers SO(&MI); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index a9fa73b60a097..8d597ceb0a043 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -124,10 +124,10 @@ Register llvm::constrainOperandRegClass( // register types (E.g., AMDGPU's VGPR and AGPR). The regbank ambiguity // resolved by targets during regbankselect should not be overridden. 
if (const auto *SubRC = TRI.getCommonSubClass( - OpRC, TRI.getConstrainedRegClassForOperand(RegMO, MRI))) + OpRC, TRI.getConstrainedRegClassForOperand(RegMO, MRI), MRI)) OpRC = SubRC; - OpRC = TRI.getAllocatableClass(OpRC); + OpRC = TRI.getAllocatableClass(OpRC, MRI); } if (!OpRC) { diff --git a/llvm/lib/CodeGen/LiveStacks.cpp b/llvm/lib/CodeGen/LiveStacks.cpp index 8fc5a929d77b2..4290d511dd1c2 100644 --- a/llvm/lib/CodeGen/LiveStacks.cpp +++ b/llvm/lib/CodeGen/LiveStacks.cpp @@ -45,6 +45,7 @@ void LiveStacks::releaseMemory() { bool LiveStacks::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); // FIXME: No analysis is being done right now. We are relying on the // register allocators to provide the information. return false; @@ -64,7 +65,7 @@ LiveStacks::getOrCreateInterval(int Slot, const TargetRegisterClass *RC) { } else { // Use the largest common subclass register class. const TargetRegisterClass *OldRC = S2RCMap[Slot]; - S2RCMap[Slot] = TRI->getCommonSubClass(OldRC, RC); + S2RCMap[Slot] = TRI->getCommonSubClass(OldRC, RC, *MRI); } return I->second; } diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index e09318a486955..27e72ccc74052 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -687,7 +687,7 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS, Error = true; break; case VRegInfo::NORMAL: - if (!Info.D.RC->isAllocatable()) { + if (!Info.D.RC->isAllocatable() || Info.D.RC->isSynthetic()) { error(Twine("Cannot use non-allocatable class '") + TRI->getRegClassName(Info.D.RC) + "' for virtual register " + Name + " in function '" + MF.getName() + "'"); diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 26a8d00e66265..44ade0d87159a 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -196,7 +196,7 @@ bool 
MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, // cse-add-with-overflow.ll). This can be done here as follows: // if (SrcSubReg) // RC = TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), RC, - // SrcSubReg); + // SrcSubReg, MRI); // MO.substVirtReg(SrcReg, SrcSubReg, *TRI); // // The 2-addr pass has been updated to handle coalesced subregs. However, diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index a4c87a7678bd8..703af75ac5c18 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -175,7 +175,7 @@ bool MachineCombiner::isTransientMI(const MachineInstr *MI) { auto SrcSub = MI->getOperand(1).getSubReg(); auto SrcRC = MRI->getRegClass(Src); auto DstRC = MRI->getRegClass(Dst); - return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr; + return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub, *MRI) != nullptr; } if (Src.isPhysical() && Dst.isPhysical()) diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index fe2f9ccd33a33..79f3873c85852 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -940,36 +940,40 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx, const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg( Register Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII, - const TargetRegisterInfo *TRI, bool ExploreBundle) const { + const TargetRegisterInfo *TRI, const MachineRegisterInfo &MRI, + bool ExploreBundle) const { // Check every operands inside the bundle if we have // been asked to. if (ExploreBundle) for (ConstMIBundleOperands OpndIt(*this); OpndIt.isValid() && CurRC; ++OpndIt) CurRC = OpndIt->getParent()->getRegClassConstraintEffectForVRegImpl( - OpndIt.getOperandNo(), Reg, CurRC, TII, TRI); + OpndIt.getOperandNo(), Reg, CurRC, TII, TRI, MRI); else // Otherwise, just check the current operands. 
for (unsigned i = 0, e = NumOperands; i < e && CurRC; ++i) - CurRC = getRegClassConstraintEffectForVRegImpl(i, Reg, CurRC, TII, TRI); + CurRC = + getRegClassConstraintEffectForVRegImpl(i, Reg, CurRC, TII, TRI, MRI); return CurRC; } const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVRegImpl( unsigned OpIdx, Register Reg, const TargetRegisterClass *CurRC, - const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const { + const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const { assert(CurRC && "Invalid initial register class"); // Check if Reg is constrained by some of its use/def from MI. const MachineOperand &MO = getOperand(OpIdx); if (!MO.isReg() || MO.getReg() != Reg) return CurRC; // If yes, accumulate the constraints through the operand. - return getRegClassConstraintEffect(OpIdx, CurRC, TII, TRI); + return getRegClassConstraintEffect(OpIdx, CurRC, TII, TRI, MRI); } const TargetRegisterClass *MachineInstr::getRegClassConstraintEffect( unsigned OpIdx, const TargetRegisterClass *CurRC, - const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const { + const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const { const TargetRegisterClass *OpRC = getRegClassConstraint(OpIdx, TII, TRI); const MachineOperand &MO = getOperand(OpIdx); assert(MO.isReg() && @@ -977,11 +981,11 @@ const TargetRegisterClass *MachineInstr::getRegClassConstraintEffect( assert(CurRC && "Invalid initial register class"); if (unsigned SubIdx = MO.getSubReg()) { if (OpRC) - CurRC = TRI->getMatchingSuperRegClass(CurRC, OpRC, SubIdx); + CurRC = TRI->getMatchingSuperRegClass(CurRC, OpRC, SubIdx, MRI); else CurRC = TRI->getSubClassWithSubReg(CurRC, SubIdx); } else if (OpRC) - CurRC = TRI->getCommonSubClass(CurRC, OpRC); + CurRC = TRI->getCommonSubClass(CurRC, OpRC, MRI); return CurRC; } diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 
b0c1838b3ff0e..2be50214e3ad8 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -48,6 +48,8 @@ MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF) RegAllocHints.reserve(256); UsedPhysRegMask.resize(NumRegs); PhysRegUseDefLists.reset(new MachineOperand*[NumRegs]()); + initializeRegClassSyntheticInfo(); + RegClassAllocationMasks.clear(); TheDelegates.clear(); } @@ -55,7 +57,8 @@ MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF) /// void MachineRegisterInfo::setRegClass(Register Reg, const TargetRegisterClass *RC) { - assert(RC && RC->isAllocatable() && "Invalid RC for virtual register"); + assert(RC && isEnabled(RC) && RC->isAllocatable() && + "Invalid RC for virtual register"); VRegInfo[Reg].first = RC; } @@ -71,7 +74,7 @@ constrainRegClass(MachineRegisterInfo &MRI, Register Reg, if (OldRC == RC) return RC; const TargetRegisterClass *NewRC = - MRI.getTargetRegisterInfo()->getCommonSubClass(OldRC, RC); + MRI.getTargetRegisterInfo()->getCommonSubClass(OldRC, RC, MRI); if (!NewRC || NewRC == OldRC) return NewRC; if (NewRC->getNumRegs() < MinNumRegs) @@ -134,7 +137,7 @@ MachineRegisterInfo::recomputeRegClass(Register Reg) { MachineInstr *MI = MO.getParent(); unsigned OpNo = &MO - &MI->getOperand(0); NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, TII, - getTargetRegisterInfo()); + getTargetRegisterInfo(), *this); if (!NewRC || NewRC == OldRC) return false; } @@ -159,6 +162,8 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass, assert(RegClass && "Cannot create register without RegClass!"); assert(RegClass->isAllocatable() && "Virtual register RegClass must be allocatable."); + assert(isEnabled(RegClass) && + "RegClass must be enabled first to create its virtual registers."); // New virtual register number. 
Register Reg = createIncompleteVirtualRegister(Name); @@ -650,6 +655,43 @@ void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef CSRs) { IsUpdatedCSRsInitialized = true; } +void MachineRegisterInfo::initializeRegClassSyntheticInfo() { + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + + RegClassSyntheticInfo.resize(TRI->getNumRegClasses()); + for (const TargetRegisterClass *RC : TRI->regclasses()) { + if (RC->isSynthetic()) + RegClassSyntheticInfo.set(RC->getID()); + } +} + +void MachineRegisterInfo::changeSyntheticInfoForRC( + const TargetRegisterClass *RC, bool Value) { + assert(RC->isSynthetic() && "Regclasses can be enabled/disabled dynamically " + "only if marked synthetic."); + + if (Value) + RegClassSyntheticInfo.set(RC->getID()); + else + RegClassSyntheticInfo.reset(RC->getID()); +} + +bool MachineRegisterInfo::isEnabled(const TargetRegisterClass *RC) const { + return !RegClassSyntheticInfo.test(RC->getID()); +} + +void MachineRegisterInfo::updateAllocationMaskForRCs( + std::vector &&Mask) { + unsigned Size = Mask.size(); + for (unsigned I = 0; I < Size; ++I) + RegClassAllocationMasks.push_back(std::move(Mask[I])); +} + +const BitVector &MachineRegisterInfo::getAllocationMaskForRC( + const TargetRegisterClass &RC) const { + return RegClassAllocationMasks[RC.getID()]; +} + bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const { const TargetRegisterInfo *TRI = getTargetRegisterInfo(); for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index c69d36fc7fdd6..a33653160cb69 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2109,14 +2109,14 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { TypeSize DstSize = TRI->getRegSizeInBits(DstReg, *MRI); if (SrcReg.isPhysical() && DstTy.isValid()) { const TargetRegisterClass *SrcRC = - TRI->getMinimalPhysRegClassLLT(SrcReg, 
DstTy); + TRI->getMinimalPhysRegClassLLT(SrcReg, *MRI, DstTy); if (SrcRC) SrcSize = TRI->getRegSizeInBits(*SrcRC); } if (DstReg.isPhysical() && SrcTy.isValid()) { const TargetRegisterClass *DstRC = - TRI->getMinimalPhysRegClassLLT(DstReg, SrcTy); + TRI->getMinimalPhysRegClassLLT(DstReg, *MRI, SrcTy); if (DstRC) DstSize = TRI->getRegSizeInBits(*DstRC); } @@ -2490,7 +2490,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("No largest legal super class exists.", MO, MONum); return; } - DRC = TRI->getMatchingSuperRegClass(SuperRC, DRC, SubIdx); + DRC = TRI->getMatchingSuperRegClass(SuperRC, DRC, SubIdx, *MRI); if (!DRC) { report("No matching super-reg register class.", MO, MONum); return; diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 1b1f22e827cb1..d5809fa46939b 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -793,7 +793,7 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg, // Keep following the chain if the value isn't any better yet. 
const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg); if (!TRI->shouldRewriteCopySrc(DefRC, RegSubReg.SubReg, SrcRC, - CurSrcPair.SubReg)) + CurSrcPair.SubReg, *MRI)) continue; // We currently cannot deal with subreg operands on PHI instructions diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index eaf96ec5cbde8..77271140cb7fa 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -477,7 +477,8 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, continue; unsigned Reg = CS.getReg(); - const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + RegInfo->getMinimalPhysRegClass(Reg, F.getRegInfo()); int FrameIdx; if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) { @@ -607,7 +608,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, TII.get(TargetOpcode::COPY), CS.getDstReg()) .addReg(Reg, getKillRegState(true)); } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(Reg, MF.getRegInfo()); TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, TRI, Register()); } @@ -634,7 +636,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg) .addReg(CI.getDstReg(), getKillRegState(true)); } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(Reg, MF.getRegInfo()); TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI, Register()); assert(I != RestoreBlock.begin() && diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index a208bf89fadf2..decf7b6cda14a 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1342,8 +1342,9 @@ static 
unsigned getNumAllocatableRegsForConstraints( assert(SuperRC && "Invalid register class"); const TargetRegisterClass *ConstrainedRC = - MI->getRegClassConstraintEffectForVReg(Reg, SuperRC, TII, TRI, - /* ExploreBundle */ true); + MI->getRegClassConstraintEffectForVReg( + Reg, SuperRC, TII, TRI, MI->getParent()->getParent()->getRegInfo(), + /* ExploreBundle */ true); if (!ConstrainedRC) return 0; return RCI.getNumAllocatableRegs(ConstrainedRC); diff --git a/llvm/lib/CodeGen/RegisterBank.cpp b/llvm/lib/CodeGen/RegisterBank.cpp index bdc6df78fd3d9..ac4ef3344e7eb 100644 --- a/llvm/lib/CodeGen/RegisterBank.cpp +++ b/llvm/lib/CodeGen/RegisterBank.cpp @@ -36,7 +36,7 @@ bool RegisterBank::verify(const RegisterBankInfo &RBI, for (unsigned SubRCId = 0; SubRCId != End; ++SubRCId) { const TargetRegisterClass &SubRC = *TRI.getRegClass(RCId); - if (!RC.hasSubClassEq(&SubRC)) + if (SubRC.isSynthetic() || !RC.hasSubClassEq(&SubRC)) continue; // Verify that the Size of the register bank is big enough to cover @@ -91,7 +91,7 @@ void RegisterBank::print(raw_ostream &OS, bool IsForDebug, for (unsigned RCId = 0, End = TRI->getNumRegClasses(); RCId != End; ++RCId) { const TargetRegisterClass &RC = *TRI->getRegClass(RCId); - if (covers(RC)) + if (covers(RC) && !RC.isSynthetic()) OS << LS << TRI->getRegClassName(&RC); } } diff --git a/llvm/lib/CodeGen/RegisterBankInfo.cpp b/llvm/lib/CodeGen/RegisterBankInfo.cpp index 5548430d1b0ae..b57a19dd4ad85 100644 --- a/llvm/lib/CodeGen/RegisterBankInfo.cpp +++ b/llvm/lib/CodeGen/RegisterBankInfo.cpp @@ -86,7 +86,7 @@ RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI, if (!Reg.isVirtual()) { // FIXME: This was probably a copy to a virtual register that does have a // type we could use. - const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg, TRI); + const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg, TRI, MRI); return RC ? 
&getRegBankFromRegClass(*RC, LLT()) : nullptr; } @@ -101,12 +101,14 @@ RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterClass * RegisterBankInfo::getMinimalPhysRegClass(Register Reg, - const TargetRegisterInfo &TRI) const { + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI) const { assert(Reg.isPhysical() && "Reg must be a physreg"); const auto &RegRCIt = PhysRegMinimalRCs.find(Reg); if (RegRCIt != PhysRegMinimalRCs.end()) return RegRCIt->second; - const TargetRegisterClass *PhysRC = TRI.getMinimalPhysRegClassLLT(Reg, LLT()); + const TargetRegisterClass *PhysRC = + TRI.getMinimalPhysRegClassLLT(Reg, MRI, LLT()); PhysRegMinimalRCs[Reg] = PhysRC; return PhysRC; } @@ -503,7 +505,7 @@ TypeSize RegisterBankInfo::getSizeInBits(Register Reg, // Instead, we need to access a register class that contains Reg and // get the size of that register class. // Because this is expensive, we'll cache the register class by calling - auto *RC = getMinimalPhysRegClass(Reg, TRI); + auto *RC = getMinimalPhysRegClass(Reg, TRI, MRI); assert(RC && "Expecting Register class"); return TRI.getRegSizeInBits(*RC); } diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp index 9312bc03bc522..a373aed30b875 100644 --- a/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -142,9 +142,18 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { // FIXME: Once targets reserve registers instead of removing them from the // allocation order, we can simply use begin/end here. ArrayRef RawOrder = RC->getRawAllocationOrder(*MF); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + // Apply the allocation mask for the regclass if one is found. This adds + // another level of restriction based on target constraints before getting + // the actual allocation order. 
+ BitVector ReservedForRC(TRI->getNumRegs(), false); + ReservedForRC |= Reserved; + if (MRI.hasAllocationMaskForRCs()) + ReservedForRC |= MRI.getAllocationMaskForRC(*RC); + for (unsigned PhysReg : RawOrder) { // Remove reserved registers from the allocation order. - if (Reserved.test(PhysReg)) + if (ReservedForRC.test(PhysReg)) continue; uint8_t Cost = RegCosts[PhysReg]; MinCost = std::min(MinCost, Cost); diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 7e9c992031f8d..ff300430d38d0 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -498,21 +498,21 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { if (Src == Dst && SrcSub != DstSub) return false; - NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub, - SrcIdx, DstIdx); + NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub, SrcIdx, + DstIdx, MRI); if (!NewRC) return false; } else if (DstSub) { // SrcReg will be merged with a sub-register of DstReg. SrcIdx = DstSub; - NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub); + NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub, MRI); } else if (SrcSub) { // DstReg will be merged with a sub-register of SrcReg. DstIdx = SrcSub; - NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub); + NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub, MRI); } else { // This is a straight copy without sub-registers. - NewRC = TRI.getCommonSubClass(DstRC, SrcRC); + NewRC = TRI.getCommonSubClass(DstRC, SrcRC, MRI); } // The combined constraint may be impossible to satisfy. 
@@ -1387,7 +1387,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, && "Shouldn't have SrcIdx+DstIdx at this point"); const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); const TargetRegisterClass *CommonRC = - TRI->getCommonSubClass(DefRC, DstRC); + TRI->getCommonSubClass(DefRC, DstRC, *MRI); if (CommonRC != nullptr) { NewRC = CommonRC; @@ -1481,9 +1481,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (DefRC != nullptr) { if (NewIdx) - NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx); + NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx, *MRI); else - NewRC = TRI->getCommonSubClass(NewRC, DefRC); + NewRC = TRI->getCommonSubClass(NewRC, DefRC, *MRI); assert(NewRC && "subreg chosen for remat incompatible with instruction"); } // Remap subranges to new lanemask and change register class. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f199625bf67ad..a8a4286c26230 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19094,7 +19094,8 @@ struct LoadedSlice { const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo(); // Assume bitcasts are cheap, unless both register classes do not // explicitly share a common sub class. - if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC)) + if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC, + DAG->getMachineFunction().getRegInfo())) return false; // Check if it will be merged with the load. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 54409cbf91f1f..a557406b236fb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -131,13 +131,13 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, const TargetRegisterClass *RC = nullptr; if (i + II.getNumDefs() < II.getNumOperands()) { RC = TRI->getAllocatableClass( - TII->getRegClass(II, i + II.getNumDefs(), TRI, *MF)); + TII->getRegClass(II, i + II.getNumDefs(), TRI, *MF), *MRI); } if (!UseRC) UseRC = RC; else if (RC) { const TargetRegisterClass *ComRC = - TRI->getCommonSubClass(UseRC, RC); + TRI->getCommonSubClass(UseRC, RC, *MRI); // If multiple uses expect disjoint register classes, we emit // copies in AddRegisterOperand. if (ComRC) @@ -152,7 +152,7 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, } const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr; - SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT); + SrcRC = TRI->getMinimalPhysRegClass(SrcReg, *MRI, VT); // Figure out the register class to create for the destreg. if (VRBase) { @@ -203,7 +203,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, // register instead of creating a new vreg. Register VRBase; const TargetRegisterClass *RC = - TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF)); + TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF), *MRI); // Always let the value type influence the used register class. The // constraints on the instruction may be too lax to represent the value // type correctly. 
For example, a 64-bit float (X86::FR64) can't live in @@ -213,7 +213,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, Node->getSimpleValueType(i), (Node->isDivergent() || (RC && TRI->isDivergentRegClass(RC)))); if (RC) - VTRC = TRI->getCommonSubClass(RC, VTRC); + VTRC = TRI->getCommonSubClass(RC, VTRC, *MRI); if (VTRC) RC = VTRC; } @@ -350,7 +350,7 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, const TargetRegisterClass *ConstrainedRC = MRI->constrainRegClass(VReg, OpRC, MinNumRegs); if (!ConstrainedRC) { - OpRC = TRI->getAllocatableClass(OpRC); + OpRC = TRI->getAllocatableClass(OpRC, *MRI); assert(OpRC && "Constraints cannot be fulfilled for allocation"); Register NewVReg = MRI->createVirtualRegister(OpRC); BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(), @@ -412,7 +412,8 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB, Register VReg = R->getReg(); MVT OpVT = Op.getSimpleValueType(); const TargetRegisterClass *IIRC = - II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI, *MF)) + II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI, *MF), + *MRI) : nullptr; const TargetRegisterClass *OpRC = TLI->isTypeLegal(OpVT) @@ -640,7 +641,7 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, // Create the new VReg in the destination class and emit a copy. 
unsigned DstRCIdx = Node->getConstantOperandVal(1); const TargetRegisterClass *DstRC = - TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx)); + TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx), *MRI); Register NewVReg = MRI->createVirtualRegister(DstRC); BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); @@ -658,7 +659,8 @@ void InstrEmitter::EmitRegSequence(SDNode *Node, bool IsClone, bool IsCloned) { unsigned DstRCIdx = Node->getConstantOperandVal(0); const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx); - Register NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC)); + Register NewVReg = + MRI->createVirtualRegister(TRI->getAllocatableClass(RC, *MRI)); const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE); MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II, NewVReg); unsigned NumOps = Node->getNumOperands(); @@ -681,7 +683,7 @@ void InstrEmitter::EmitRegSequence(SDNode *Node, unsigned SubReg = getVR(Node->getOperand(i-1), VRBaseMap); const TargetRegisterClass *TRC = MRI->getRegClass(SubReg); const TargetRegisterClass *SRC = - TRI->getMatchingSuperRegClass(RC, TRC, SubIdx); + TRI->getMatchingSuperRegClass(RC, TRC, SubIdx, *MRI); if (SRC && SRC != RC) { MRI->setRegClass(NewVReg, SRC); RC = SRC; diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index e3acb58327a8c..f6124201775d9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -581,7 +581,7 @@ void ScheduleDAGFast::ListScheduleBottomUp() { SUnit *LRDef = LiveRegDefs[Reg]; MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(Reg, VT); + TRI->getMinimalPhysRegClass(Reg, MRI, VT); const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC); // If cross copy register class is the same as RC, then it must be diff --git 
a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index e4ee3fd99f16e..699fab5333e5c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1561,8 +1561,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() { unsigned Reg = LRegs[0]; SUnit *LRDef = LiveRegDefs[Reg]; MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); - const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(Reg, VT); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI, VT); const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC); // If cross copy register class is the same as RC, then it must be possible diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index c9e2745f00c95..07728bd3b4354 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -111,12 +111,14 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, const TargetLowering &TLI, + const MachineRegisterInfo &MRI, unsigned &PhysReg, int &Cost) { if (Op != 2 || User->getOpcode() != ISD::CopyToReg) return; unsigned Reg = cast(User->getOperand(1))->getReg(); - if (TLI.checkForPhysRegDependency(Def, User, Op, TRI, TII, PhysReg, Cost)) + if (TLI.checkForPhysRegDependency(Def, User, Op, TRI, MRI, TII, PhysReg, + Cost)) return; if (Register::isVirtualRegister(Reg)) @@ -134,7 +136,7 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, if (PhysReg != 0) { const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(Reg, Def->getSimpleValueType(ResNo)); + TRI->getMinimalPhysRegClass(Reg, MRI, Def->getSimpleValueType(ResNo)); Cost = RC->getCopyCost(); } } @@ -490,7 +492,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() { int 
Cost = 1; // Determine if this is a physical register dependency. const TargetLowering &TLI = DAG->getTargetLoweringInfo(); - CheckForPhysRegDependency(OpN, N, i, TRI, TII, TLI, PhysReg, Cost); + CheckForPhysRegDependency(OpN, N, i, TRI, TII, TLI, MRI, PhysReg, Cost); assert((PhysReg == 0 || !isChain) && "Chain dependence via physreg data?"); // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2d63774c75e37..982d1a0c44f77 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9738,9 +9738,11 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, Register TiedReg = R->getReg(); MVT RegVT = R->getSimpleValueType(0); const TargetRegisterClass *RC = - TiedReg.isVirtual() ? MRI.getRegClass(TiedReg) - : RegVT != MVT::Untyped ? TLI.getRegClassFor(RegVT) - : TRI.getMinimalPhysRegClass(TiedReg); + TiedReg.isVirtual() + ? MRI.getRegClass(TiedReg) + : RegVT != MVT::Untyped + ? 
TLI.getRegClassFor(RegVT) + : TRI.getMinimalPhysRegClass(TiedReg, MRI); for (unsigned i = 0, e = Flag.getNumOperandRegisters(); i != e; ++i) Regs.push_back(MRI.createVirtualRegister(RC)); diff --git a/llvm/lib/CodeGen/StackMaps.cpp b/llvm/lib/CodeGen/StackMaps.cpp index 90aa93e442cf3..eca640ef9ad64 100644 --- a/llvm/lib/CodeGen/StackMaps.cpp +++ b/llvm/lib/CodeGen/StackMaps.cpp @@ -277,7 +277,8 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, assert(MOI->getReg().isPhysical() && "Virtreg operands should have been rewritten before now."); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(MOI->getReg()); + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(MOI->getReg(), AP.MF->getRegInfo()); assert(!MOI->getSubReg() && "Physical subreg still around."); unsigned Offset = 0; @@ -373,7 +374,8 @@ void StackMaps::print(raw_ostream &OS) { StackMaps::LiveOutReg StackMaps::createLiveOutReg(unsigned Reg, const TargetRegisterInfo *TRI) const { unsigned DwarfRegNum = getDwarfRegNum(Reg, TRI); - unsigned Size = TRI->getSpillSize(*TRI->getMinimalPhysRegClass(Reg)); + unsigned Size = + TRI->getSpillSize(*TRI->getMinimalPhysRegClass(Reg, AP.MF->getRegInfo())); return LiveOutReg(Reg, DwarfRegNum, Size); } diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 5ed67bd0a121e..e42c4f1348ab7 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -417,7 +417,7 @@ void TailDuplicator::duplicateInstruction( const TargetRegisterClass *ConstrRC; if (VI->second.SubReg != 0) { ConstrRC = TRI->getMatchingSuperRegClass(MappedRC, OrigRC, - VI->second.SubReg); + VI->second.SubReg, *MRI); if (ConstrRC) { // The actual constraining (as in "find appropriate new class") // is done by getMatchingSuperRegClass, so now we only need to diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9990556f89ed8..9adfa8f839104 100644 --- 
a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1365,7 +1365,8 @@ TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI, for (unsigned i : SuperRegRC.set_bits()) { const TargetRegisterClass *SuperRC = TRI->getRegClass(i); // We want the largest possible spill size. - if (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC)) + if (SuperRC->isSynthetic() || + (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC))) continue; if (!isLegalRC(*TRI, *SuperRC)) continue; diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index c9503fcb77bb2..7c24748acdd93 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -193,14 +193,15 @@ Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo, /// getAllocatableClass - Return the maximal subclass of the given register /// class that is alloctable, or NULL. const TargetRegisterClass * -TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const { +TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC, + const MachineRegisterInfo &MRI) const { if (!RC || RC->isAllocatable()) return RC; for (BitMaskClassIterator It(RC->getSubClassMask(), *this); It.isValid(); ++It) { const TargetRegisterClass *SubRC = getRegClass(It.getID()); - if (SubRC->isAllocatable()) + if (SubRC->isAllocatable() && MRI.isEnabled(SubRC)) return SubRC; } return nullptr; @@ -209,8 +210,8 @@ TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const { /// getMinimalPhysRegClass - Returns the Register Class of a physical /// register of the given type, picking the most sub register class of /// the right type that contains this physreg. 
-const TargetRegisterClass * -TargetRegisterInfo::getMinimalPhysRegClass(MCRegister reg, MVT VT) const { +const TargetRegisterClass *TargetRegisterInfo::getMinimalPhysRegClass( + MCRegister reg, const MachineRegisterInfo &MRI, MVT VT) const { assert(Register::isPhysicalRegister(reg) && "reg must be a physical register"); @@ -219,7 +220,8 @@ TargetRegisterInfo::getMinimalPhysRegClass(MCRegister reg, MVT VT) const { const TargetRegisterClass* BestRC = nullptr; for (const TargetRegisterClass* RC : regclasses()) { if ((VT == MVT::Other || isTypeLegalForClass(*RC, VT)) && - RC->contains(reg) && (!BestRC || BestRC->hasSubClass(RC))) + RC->contains(reg) && MRI.isEnabled(RC) && + (!BestRC || BestRC->hasSubClass(RC))) BestRC = RC; } @@ -227,8 +229,8 @@ TargetRegisterInfo::getMinimalPhysRegClass(MCRegister reg, MVT VT) const { return BestRC; } -const TargetRegisterClass * -TargetRegisterInfo::getMinimalPhysRegClassLLT(MCRegister reg, LLT Ty) const { +const TargetRegisterClass *TargetRegisterInfo::getMinimalPhysRegClassLLT( + MCRegister reg, const MachineRegisterInfo &MRI, LLT Ty) const { assert(Register::isPhysicalRegister(reg) && "reg must be a physical register"); @@ -237,7 +239,7 @@ TargetRegisterInfo::getMinimalPhysRegClassLLT(MCRegister reg, LLT Ty) const { const TargetRegisterClass *BestRC = nullptr; for (const TargetRegisterClass *RC : regclasses()) { if ((!Ty.isValid() || isTypeLegalForClass(*RC, Ty)) && RC->contains(reg) && - (!BestRC || BestRC->hasSubClass(RC))) + MRI.isEnabled(RC) && (!BestRC || BestRC->hasSubClass(RC))) BestRC = RC; } @@ -248,7 +250,8 @@ TargetRegisterInfo::getMinimalPhysRegClassLLT(MCRegister reg, LLT Ty) const { /// registers for the specific register class. 
static void getAllocatableSetForRC(const MachineFunction &MF, const TargetRegisterClass *RC, BitVector &R){ - assert(RC->isAllocatable() && "invalid for nonallocatable sets"); + assert(RC->isAllocatable() && MF.getRegInfo().isEnabled(RC) && + "invalid for nonallocatable sets"); ArrayRef Order = RC->getRawAllocationOrder(MF); for (MCPhysReg PR : Order) R.set(PR); @@ -257,38 +260,44 @@ static void getAllocatableSetForRC(const MachineFunction &MF, BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, const TargetRegisterClass *RC) const { BitVector Allocatable(getNumRegs()); + const MachineRegisterInfo &MRI = MF.getRegInfo(); if (RC) { // A register class with no allocatable subclass returns an empty set. - const TargetRegisterClass *SubClass = getAllocatableClass(RC); + const TargetRegisterClass *SubClass = getAllocatableClass(RC, MRI); if (SubClass) getAllocatableSetForRC(MF, SubClass, Allocatable); } else { for (const TargetRegisterClass *C : regclasses()) - if (C->isAllocatable()) + if (C->isAllocatable() && MRI.isEnabled(C)) getAllocatableSetForRC(MF, C, Allocatable); } // Mask out the reserved registers - const MachineRegisterInfo &MRI = MF.getRegInfo(); const BitVector &Reserved = MRI.getReservedRegs(); Allocatable.reset(Reserved); return Allocatable; } -static inline -const TargetRegisterClass *firstCommonClass(const uint32_t *A, - const uint32_t *B, - const TargetRegisterInfo *TRI) { - for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) - if (unsigned Common = *A++ & *B++) - return TRI->getRegClass(I + llvm::countr_zero(Common)); +static inline const TargetRegisterClass * +firstCommonClass(const uint32_t *A, const uint32_t *B, + const TargetRegisterInfo *TRI, + const MachineRegisterInfo &MRI) { + for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) { + if (unsigned Common = *A++ & *B++) { + const TargetRegisterClass *RC = + TRI->getRegClass(I + llvm::countr_zero(Common)); + if (MRI.isEnabled(RC)) + return RC; + 
} + } return nullptr; } const TargetRegisterClass * TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B) const { + const TargetRegisterClass *B, + const MachineRegisterInfo &MRI) const { // First take care of the trivial cases. if (A == B) return A; @@ -297,13 +306,13 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, // Register classes are ordered topologically, so the largest common // sub-class it the common sub-class with the smallest ID. - return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this); + return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this, + MRI); } -const TargetRegisterClass * -TargetRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, - const TargetRegisterClass *B, - unsigned Idx) const { +const TargetRegisterClass *TargetRegisterInfo::getMatchingSuperRegClass( + const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx, + const MachineRegisterInfo &MRI) const { assert(A && B && "Missing register class"); assert(Idx && "Bad sub-register index"); @@ -312,14 +321,14 @@ TargetRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, if (RCI.getSubReg() == Idx) // The bit mask contains all register classes that are projected into B // by Idx. Find a class that is also a sub-class of A. 
- return firstCommonClass(RCI.getMask(), A->getSubClassMask(), this); + return firstCommonClass(RCI.getMask(), A->getSubClassMask(), this, MRI); return nullptr; } -const TargetRegisterClass *TargetRegisterInfo:: -getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, - const TargetRegisterClass *RCB, unsigned SubB, - unsigned &PreA, unsigned &PreB) const { +const TargetRegisterClass *TargetRegisterInfo::getCommonSuperRegClass( + const TargetRegisterClass *RCA, unsigned SubA, + const TargetRegisterClass *RCB, unsigned SubB, unsigned &PreA, + unsigned &PreB, const MachineRegisterInfo &MRI) const { assert(RCA && SubA && RCB && SubB && "Invalid arguments"); // Search all pairs of sub-register indices that project into RCA and RCB @@ -352,7 +361,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, for (SuperRegClassIterator IB(RCB, this, true); IB.isValid(); ++IB) { // Check if a common super-register class exists for this index pair. const TargetRegisterClass *RC = - firstCommonClass(IA.getMask(), IB.getMask(), this); + firstCommonClass(IA.getMask(), IB.getMask(), this, MRI); if (!RC || getRegSizeInBits(*RC) < MinSize) continue; @@ -384,7 +393,8 @@ static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, const TargetRegisterClass *DefRC, unsigned DefSubReg, const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) { + unsigned SrcSubReg, + const MachineRegisterInfo &MRI) { // Same register class. 
if (DefRC == SrcRC) return true; @@ -393,7 +403,7 @@ static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, unsigned SrcIdx, DefIdx; if (SrcSubReg && DefSubReg) { return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, - SrcIdx, DefIdx) != nullptr; + SrcIdx, DefIdx, MRI) != nullptr; } // At most one of the register is a sub register, make it Src to avoid @@ -405,18 +415,19 @@ static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, // One of the register is a sub register, check if we can get a superclass. if (SrcSubReg) - return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr; + return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg, MRI) != + nullptr; // Plain copy. - return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr; + return TRI.getCommonSubClass(DefRC, SrcRC, MRI) != nullptr; } -bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const { +bool TargetRegisterInfo::shouldRewriteCopySrc( + const TargetRegisterClass *DefRC, unsigned DefSubReg, + const TargetRegisterClass *SrcRC, unsigned SrcSubReg, + const MachineRegisterInfo &MRI) const { // If this source does not incur a cross register bank copy, use it. - return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg); + return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg, MRI); } // Compute target-independent register allocator hints to help eliminate copies. @@ -507,7 +518,7 @@ TargetRegisterInfo::getRegSizeInBits(Register Reg, // The size is not directly available for physical registers. // Instead, we need to access a register class that contains Reg and // get the size of that register class. 
- RC = getMinimalPhysRegClass(Reg); + RC = getMinimalPhysRegClass(Reg, MRI); assert(RC && "Unable to deduce the register class"); return getRegSizeInBits(*RC); } diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index ebacbc420f858..f38186a892f0e 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1328,9 +1328,8 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, if (UnfoldMCID.getNumDefs() == 1) { // Unfold the load. LLVM_DEBUG(dbgs() << "2addr: UNFOLDING: " << MI); - const TargetRegisterClass *RC = - TRI->getAllocatableClass( - TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF)); + const TargetRegisterClass *RC = TRI->getAllocatableClass( + TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF), *MRI); Register Reg = MRI->createVirtualRegister(RC); SmallVector NewMIs; if (!TII->unfoldMemoryOperand(*MF, MI, Reg, @@ -1531,7 +1530,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, if (SubRegB) { if (RegA.isVirtual()) { assert(TRI->getMatchingSuperRegClass(RC, MRI->getRegClass(RegA), - SubRegB) && + SubRegB, *MRI) && "tied subregister must be a truncation"); // The superreg class will not be used to constrain the subreg class. 
RC = nullptr; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 5cc612e89162a..f2090ffb04d84 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3451,7 +3451,8 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( for (auto &CS : CSI) { Register Reg = CS.getReg(); - const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + RegInfo->getMinimalPhysRegClass(Reg, MF.getRegInfo()); unsigned Size = RegInfo->getSpillSize(*RC); Align Alignment(RegInfo->getSpillAlign(*RC)); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 02943b8a4ab15..c8cd27d247374 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -703,15 +703,15 @@ bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, int &FalseCycles) const { // Check register classes. const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *RC = - RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + const TargetRegisterClass *RC = RI.getCommonSubClass( + MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg), MRI); if (!RC) return false; // Also need to check the dest regclass, in case we're trying to optimize // something like: // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 - if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) + if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg), MRI)) return false; // Expanding cbz/tbz requires an extra cycle of latency on the condition. @@ -5557,8 +5557,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // This is slightly expensive to compute for physical regs since // getMinimalPhysRegClass is slow. auto getRegClass = [&](unsigned Reg) { - return Register::isVirtualRegister(Reg) ? 
MRI.getRegClass(Reg) - : TRI.getMinimalPhysRegClass(Reg); + return Register::isVirtualRegister(Reg) + ? MRI.getRegClass(Reg) + : TRI.getMinimalPhysRegClass(Reg, MRI); }; if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index d0adb78b231a7..2583acd67ee1a 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -879,7 +879,6 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (RenameReg) { MCRegister RegToRename = getLdStRegOp(*I).getReg(); DefinedInBB.addReg(*RenameReg); - // Return the sub/super register for RenameReg, matching the size of // OriginalReg. auto GetMatchingSubReg = @@ -895,6 +894,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, std::function UpdateMIs = [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI, bool IsDef) { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); if (IsDef) { bool SeenDef = false; for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) { @@ -916,7 +916,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (!isRewritableImplicitDef(MI.getOpcode())) continue; MatchingReg = GetMatchingSubReg( - TRI->getMinimalPhysRegClass(MOP.getReg())); + TRI->getMinimalPhysRegClass(MOP.getReg(), MRI)); } MOP.setReg(MatchingReg); SeenDef = true; @@ -936,7 +936,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MatchingReg = GetMatchingSubReg(RC); else MatchingReg = GetMatchingSubReg( - TRI->getMinimalPhysRegClass(MOP.getReg())); + TRI->getMinimalPhysRegClass(MOP.getReg(), MRI)); assert(MatchingReg != AArch64::NoRegister && "Cannot find matching regs for renaming"); MOP.setReg(MatchingReg); @@ -1422,7 +1422,8 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, static bool canRenameMOP(const 
MachineOperand &MOP, const TargetRegisterInfo *TRI) { if (MOP.isReg()) { - auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg()); + const MachineRegisterInfo &MRI = MOP.getParent()->getMF()->getRegInfo(); + auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg(), MRI); // Renaming registers with multiple disjunct sub-registers (e.g. the // result of a LD3) means that all sub-registers are renamed, potentially // impacting other instructions we did not check. Bail out. @@ -1496,6 +1497,7 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, // loop. FoundDef = IsDef; + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); // For defs, check if we can rename the first def of RegToRename. if (FoundDef) { // For some pseudo instructions, we might not generate code in the end @@ -1518,7 +1520,7 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI); return false; } - RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg())); + RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg(), MRI)); } return true; } else { @@ -1531,7 +1533,7 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI); return false; } - RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg())); + RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg(), MRI)); } } return true; @@ -1577,6 +1579,8 @@ static bool canRenameUntilSecondLoad( return false; } + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + for (auto &MOP : MI.operands()) { if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() || !TRI->regsOverlap(MOP.getReg(), RegToRename)) @@ -1585,7 +1589,8 @@ static bool canRenameUntilSecondLoad( LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI); return false; } - RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg())); + RequiredClasses.insert( + 
TRI->getMinimalPhysRegClass(MOP.getReg(), MRI)); } return true; @@ -1625,7 +1630,7 @@ static std::optional tryToFindRegisterToRename( }); }; - auto *RegClass = TRI->getMinimalPhysRegClass(Reg); + auto *RegClass = TRI->getMinimalPhysRegClass(Reg, RegInfo); for (const MCPhysReg &PR : *RegClass) { if (DefinedInBB.available(PR) && UsedInBetween.available(PR) && !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) && @@ -1653,7 +1658,9 @@ static std::optional findRenameRegForSameLdStRegPair( if (!DebugCounter::shouldExecute(RegRenamingCounter)) return RenameReg; - auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg()); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + auto *RegClass = + TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg(), MRI); MachineFunction &MF = *FirstMI.getParent()->getParent(); if (!RegClass || !MF.getRegInfo().tracksLiveness()) return RenameReg; diff --git a/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp index 019b64dd871e2..53159742d14ad 100644 --- a/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp @@ -208,7 +208,8 @@ const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask( BV.resize(TRI->getNumRegClasses()); for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) { auto *RC = TRI->getRegClass(ClassID); - if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits)) + if (RC->isAllocatable() && MRI->isEnabled(RC) && + TRI->isRegClassAligned(RC, AlignNumBits)) BV.set(ClassID); } } @@ -437,7 +438,7 @@ bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const { if (const TargetRegisterClass *OpDescRC = getOperandRegClass(MO)) { LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << " & " << TRI->getRegClassName(OpDescRC) << " = "); - SubRegRC = TRI->getCommonSubClass(SubRegRC, OpDescRC); + SubRegRC = 
TRI->getCommonSubClass(SubRegRC, OpDescRC, *MRI); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5ccf21f76015d..782d29e288e1f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16285,7 +16285,8 @@ SITargetLowering::getTargetMMOFlags(const Instruction &I) const { bool SITargetLowering::checkForPhysRegDependency( SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, - const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const { + const MachineRegisterInfo &MRI, const TargetInstrInfo *TII, + unsigned &PhysReg, int &Cost) const { if (User->getOpcode() != ISD::CopyToReg) return false; if (!Def->isMachineOpcode()) @@ -16300,8 +16301,8 @@ bool SITargetLowering::checkForPhysRegDependency( const MCInstrDesc &II = TII->get(MDef->getMachineOpcode()); if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) { PhysReg = AMDGPU::SCC; - const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo)); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + PhysReg, MRI, Def->getSimpleValueType(ResNo)); Cost = RC->getCopyCost(); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 89da4428e3ab0..33ea15834fba2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -530,6 +530,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, + const MachineRegisterInfo &MRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f4b21b7dfac39..116616a230366 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1705,7 
+1705,7 @@ static unsigned getVectorRegSpillSaveOpcode(Register Reg, bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); // Choose the right opcode if spilling a WWM register. - if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) + if (RC == &AMDGPU::WWM_VGPR_32RegClass) return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass); if (IsVectorSuperClass) @@ -1930,7 +1930,7 @@ getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); // Choose the right opcode if restoring a WWM register. - if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) + if (RC == &AMDGPU::WWM_VGPR_32RegClass) return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass); if (IsVectorSuperClass) @@ -2559,7 +2559,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, const MCInstrDesc &TID = get(NewOpcode); const TargetRegisterClass *NewRC = - RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF)); + RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF), MRI); MRI.setRegClass(DestReg, NewRC); UseMO->setReg(DestReg); @@ -4683,7 +4683,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { const TargetRegisterClass *SubRC = RI.getSubRegisterClass(RC, MO.getSubReg()); - RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); + RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg(), MRI); if (RC) RC = SubRC; } @@ -5665,7 +5665,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, if (!SuperRC) return false; - DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); + DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg(), MRI); if (!DRC) return false; } @@ -8856,9 +8856,8 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, const MachineFunction &MF) const { - const SIMachineFunctionInfo *MFI = MF.getInfo(); assert(SrcReg.isVirtual()); - if 
(MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG)) + if (MF.getRegInfo().getRegClass(SrcReg) == &AMDGPU::WWM_VGPR_32RegClass) return AMDGPU::WWM_COPY; return AMDGPU::COPY; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 4c5978cdc6665..63ebc543d0b27 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1387,7 +1387,7 @@ inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P, if (!P.SubReg) return RC == &TRC; auto *TRI = MRI.getTargetRegisterInfo(); - return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg); + return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg, MRI); } /// \brief Create RegSubRegPair from a register MachineOperand diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 4b13825040ebe..58f0de21c27cb 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -20,6 +20,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" @@ -38,6 +39,7 @@ class SILowerSGPRSpills : public MachineFunctionPass { const SIInstrInfo *TII = nullptr; LiveIntervals *LIS = nullptr; SlotIndexes *Indexes = nullptr; + MachineDominatorTree *MDT = nullptr; // Save and Restore blocks of the current function. Typically there is a // single save block, unless Windows EH funclets are involved. 
@@ -52,11 +54,14 @@ class SILowerSGPRSpills : public MachineFunctionPass { void calculateSaveRestoreBlocks(MachineFunction &MF); bool spillCalleeSavedRegs(MachineFunction &MF, SmallVectorImpl &CalleeSavedFIs); - void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS); + void updateLaneVGPRDomInstr( + int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, + DenseMap &LaneVGPRDomInstr); bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -77,6 +82,7 @@ INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) @@ -103,7 +109,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MachineInstrSpan MIS(I, &SaveBlock); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( - Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); + Reg, MRI, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); // If this value was already livein, we probably have a direct use of the // incoming register value, so don't kill at the spill point. This happens @@ -135,6 +141,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *RI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); // Restore all registers immediately before the return and any // terminators that precede it. 
MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator(); @@ -144,7 +151,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, for (const CalleeSavedInfo &CI : reverse(CSI)) { Register Reg = CI.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( - Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); + Reg, MRI, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI, Register()); @@ -233,7 +240,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs( if (SavedRegs.test(Reg)) { const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(Reg, MVT::i32); + TRI->getMinimalPhysRegClass(Reg, MRI, MVT::i32); int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC), TRI->getSpillAlign(*RC), true); @@ -259,48 +266,51 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs( return false; } -void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF, - LiveIntervals *LIS) { - // TODO: This is a workaround to avoid the unmodelled liveness computed with - // whole-wave virtual registers when allocated together with the regular VGPR - // virtual registers. Presently, the liveness computed during the regalloc is - // only uniform (or single lane aware) and it doesn't take account of the - // divergent control flow that exists for our GPUs. Since the WWM registers - // can modify inactive lanes, the wave-aware liveness should be computed for - // the virtual registers to accurately plot their interferences. Without - // having the divergent CFG for the function, it is difficult to implement the - // wave-aware liveness info. Until then, we conservatively extend the liveness - // of the wwm registers into the entire function so that they won't be reused - // without first spilling/splitting their liveranges. - SIMachineFunctionInfo *MFI = MF.getInfo(); - - // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks. 
- for (auto Reg : MFI->getSGPRSpillVGPRs()) { - for (MachineBasicBlock *SaveBlock : SaveBlocks) { - MachineBasicBlock::iterator InsertBefore = SaveBlock->begin(); - auto MIB = BuildMI(*SaveBlock, *InsertBefore, InsertBefore->getDebugLoc(), - TII->get(AMDGPU::IMPLICIT_DEF), Reg); - MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); - // Set SGPR_SPILL asm printer flag - MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL); - if (LIS) { - LIS->InsertMachineInstrInMaps(*MIB); +void SILowerSGPRSpills::updateLaneVGPRDomInstr( + int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, + DenseMap &LaneVGPRDomInstr) { + // For the Def of a virtual LaneVGPR to dominate all its uses, we should + // insert an IMPLICIT_DEF before the dominating spill. Switching to a + // depth first order doesn't really help since the machine function can be in + // the unstructured control flow post-SSA. For each virtual register, hence + // finding the common dominator to get either the dominating spill or a block + // dominating all spills. Is there a better way to handle it? + SIMachineFunctionInfo *FuncInfo = + MBB->getParent()->getInfo(); + ArrayRef VGPRSpills = + FuncInfo->getSGPRSpillToVirtualVGPRLanes(FI); + Register PrevLaneVGPR; + for (auto &Spill : VGPRSpills) { + if (PrevLaneVGPR == Spill.VGPR) + continue; + + PrevLaneVGPR = Spill.VGPR; + auto I = LaneVGPRDomInstr.find(Spill.VGPR); + if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) { + // Initially add the spill instruction itself for Insertion point. + LaneVGPRDomInstr[Spill.VGPR] = InsertPt; + } else { + assert(I != LaneVGPRDomInstr.end()); + auto PrevInsertPt = I->second; + MachineBasicBlock *DomMBB = PrevInsertPt->getParent(); + if (DomMBB == MBB) { + // The insertion point earlier selected in a predecessor block whose + // spills are currently being lowered. The earlier InsertPt would be + // the one just before the block terminator and it should be changed + // if we insert any new spill in it.
+ if (MDT->dominates(&*InsertPt, &*PrevInsertPt)) + I->second = InsertPt; + + continue; } - } - } - // Insert the KILL in the return blocks to extend their liveness untill the - // end of function. Insert a separate KILL for each VGPR. - for (MachineBasicBlock *RestoreBlock : RestoreBlocks) { - MachineBasicBlock::iterator InsertBefore = - RestoreBlock->getFirstTerminator(); - for (auto Reg : MFI->getSGPRSpillVGPRs()) { - auto MIB = - BuildMI(*RestoreBlock, *InsertBefore, InsertBefore->getDebugLoc(), - TII->get(TargetOpcode::KILL)); - MIB.addReg(Reg); - if (LIS) - LIS->InsertMachineInstrInMaps(*MIB); + // Find the common dominator block between PrevInsertPt and the + // current spill. + DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB); + if (DomMBB == MBB) + I->second = InsertPt; + else if (DomMBB != PrevInsertPt->getParent()) + I->second = &(*DomMBB->getFirstTerminator()); } } } @@ -312,6 +322,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { LIS = getAnalysisIfAvailable(); Indexes = getAnalysisIfAvailable(); + MDT = &getAnalysis(); assert(SaveBlocks.empty() && RestoreBlocks.empty()); @@ -348,6 +359,13 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + // To track the IMPLICIT_DEF insertion point for the lane vgprs. + DenseMap LaneVGPRDomInstr; + + // Enable WWM_VGPR_32 before lowering the spills so that wwm virtual + // registers can be created. 
+ MRI.changeSyntheticInfoForRC(&AMDGPU::WWM_VGPR_32RegClass, /*Value=*/false); + for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { if (!TII->isSGPRSpill(MI)) @@ -377,6 +395,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { "failed to spill SGPR to physical VGPR lane when allocated"); } } else { + MachineInstrSpan MIS(&MI, &MBB); if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( MI, FI, nullptr, Indexes, LIS); @@ -384,18 +403,24 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { llvm_unreachable( "failed to spill SGPR to virtual VGPR lane when allocated"); SpillFIs.set(FI); + updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr); SpilledToVirtVGPRLanes = true; } } } } - if (SpilledToVirtVGPRLanes) { - extendWWMVirtRegLiveness(MF, LIS); + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) { + auto InsertPt = LaneVGPRDomInstr[Reg]; + // Insert the IMPLICIT_DEF at the identified points. + auto MIB = + BuildMI(*InsertPt->getParent(), *InsertPt, InsertPt->getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), Reg); + // Set SGPR_SPILL asm printer flag + MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL); if (LIS) { - // Compute the LiveInterval for the newly created virtual registers. 
- for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) - LIS->createAndComputeVirtRegInterval(Reg); + LIS->InsertMachineInstrInMaps(*MIB); + LIS->createAndComputeVirtRegInterval(Reg); } } diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp index 9c3cd1bbd6b0f..26ccd7418d41f 100644 --- a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp @@ -55,7 +55,7 @@ class SILowerWWMCopies : public MachineFunctionPass { SlotIndexes *Indexes; VirtRegMap *VRM; const SIRegisterInfo *TRI; - const MachineRegisterInfo *MRI; + MachineRegisterInfo *MRI; SIMachineFunctionInfo *MFI; }; @@ -108,9 +108,6 @@ bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) { TRI = ST.getRegisterInfo(); MRI = &MF.getRegInfo(); - if (!MFI->hasVRegFlags()) - return false; - bool Changed = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -137,5 +134,8 @@ bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) { } } + // Disable the synthetic regclass WWM_VGPR_32. VGPR allocation is done and we + // no longer need it.
+ MRI->changeSyntheticInfoForRC(&AMDGPU::WWM_VGPR_32RegClass, /*Value=*/true); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 2569f40fec0e4..346bb2dbcfa79 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -346,7 +346,7 @@ bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + LaneVGPR = MRI.createVirtualRegister(&AMDGPU::WWM_VGPR_32RegClass); SpillVGPRs.push_back(LaneVGPR); } else { LaneVGPR = SpillVGPRs.back(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 79a7d1cf66c4d..8454636ad0a43 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2906,13 +2906,12 @@ SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { return SRC; } -const TargetRegisterClass * -SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, - const TargetRegisterClass *SubRC, - unsigned SubIdx) const { +const TargetRegisterClass *SIRegisterInfo::getCompatibleSubRegClass( + const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, + unsigned SubIdx, const MachineRegisterInfo &MRI) const { // Ensure this subregister index is aligned in the super register. const TargetRegisterClass *MatchRC = - getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); + getMatchingSuperRegClass(SuperRC, SubRC, SubIdx, MRI); return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? 
MatchRC : nullptr; } @@ -2926,10 +2925,9 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { } bool SIRegisterInfo::shouldRewriteCopySrc( - const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const { + const TargetRegisterClass *DefRC, unsigned DefSubReg, + const TargetRegisterClass *SrcRC, unsigned SrcSubReg, + const MachineRegisterInfo &MRI) const { // We want to prefer the smallest register class possible, so we don't want to // stop and rewrite on anything that looks like a subregister // extract. Operations mostly don't care about the super register class, so we @@ -2946,7 +2944,7 @@ bool SIRegisterInfo::shouldRewriteCopySrc( // => %3 = COPY %0 // Plain copy. - return getCommonSubClass(DefRC, SrcRC) != nullptr; + return getCommonSubClass(DefRC, SrcRC, MRI) != nullptr; } bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { @@ -3121,7 +3119,7 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); if (const auto *RC = RCOrRB.dyn_cast()) - return getAllocatableClass(RC); + return getAllocatableClass(RC, MRI); return nullptr; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 88d5686720985..f732682f42e7b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -264,13 +264,14 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { /// a register tuple), return null. 
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, - const TargetRegisterClass *SubRC, - unsigned SubIdx) const; + const TargetRegisterClass *SubRC, unsigned SubIdx, + const MachineRegisterInfo &MRI) const; bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, unsigned DefSubReg, const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const override; + unsigned SrcSubReg, + const MachineRegisterInfo &MRI) const override; /// \returns True if operands defined with this operand type can accept /// a literal constant (i.e. any 32-bit immediate). diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index cb6591bf62449..9ab48fc114429 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -632,6 +632,16 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1 let Size = 32; let Weight = 1; } + +// Identical to VGPR_32 but can only be used for wwm-registers. This synthetic +// class is disabled by default and will be enabled during regalloc pipeline. 
+def WWM_VGPR_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32)> { + let AllocationPriority = VGPR_32.AllocationPriority; + let Size = VGPR_32.Size; + let Weight = VGPR_32.Weight; + let BaseClassOrder = VGPR_32.BaseClassOrder; + let Synthetic = 1; +} } // End HasVGPR = 1 // VGPR 64-bit registers diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 9adf758b46c48..a85f4099e81f6 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -933,16 +933,16 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI, return false; } -bool ARMBaseRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const { +bool ARMBaseRegisterInfo::shouldRewriteCopySrc( + const TargetRegisterClass *DefRC, unsigned DefSubReg, + const TargetRegisterClass *SrcRC, unsigned SrcSubReg, + const MachineRegisterInfo &MRI) const { // We can't extract an SPR from an arbitary DPR (as opposed to a DPR_VFP2). 
if (DefRC == &ARM::SPRRegClass && DefSubReg == 0 && SrcRC == &ARM::DPRRegClass && (SrcSubReg == ARM::ssub_0 || SrcSubReg == ARM::ssub_1)) return false; - return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg, - SrcRC, SrcSubReg); + return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg, SrcRC, + SrcSubReg, MRI); } diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 53803cff8b90a..cdc362d7a91f4 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -237,7 +237,8 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo { bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, unsigned DefSubReg, const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const override; + unsigned SrcSubReg, + const MachineRegisterInfo &MRI) const override; int getSEHRegNum(unsigned i) const { return getEncodingValue(i); } diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp index 1c8213b668f71..ace5207488026 100644 --- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp @@ -124,7 +124,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const AVRSubtarget &STI = MF->getSubtarget(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + TRI.getMinimalPhysRegClass(Reg, MF->getRegInfo()); unsigned BytesPerReg = TRI.getRegSizeInBits(*RC) / 8; assert(BytesPerReg <= 2 && "Only 8 and 16 bit regs are supported."); diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp index 64dd0338bf60e..1b79758a01fc3 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -253,6 +253,7 @@ bool AVRFrameLowering::spillCalleeSavedRegisters( const AVRSubtarget &STI = MF.getSubtarget(); const 
TargetInstrInfo &TII = *STI.getInstrInfo(); AVRMachineFunctionInfo *AVRFI = MF.getInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { Register Reg = I.getReg(); @@ -268,7 +269,7 @@ bool AVRFrameLowering::spillCalleeSavedRegisters( break; } - assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && + assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg, MRI)) == 8 && "Invalid register size"); // Add the callee-saved register as live-in only if it is not already a @@ -301,11 +302,12 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters( const MachineFunction &MF = *MBB.getParent(); const AVRSubtarget &STI = MF.getSubtarget(); const TargetInstrInfo &TII = *STI.getInstrInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (const CalleeSavedInfo &CCSI : CSI) { Register Reg = CCSI.getReg(); - assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && + assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg, MRI)) == 8 && "Invalid register size"); BuildMI(MBB, MI, DL, TII.get(AVR::POPRd), Reg); diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp index cedcbff1db24f..fc8e090328d82 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp @@ -468,6 +468,7 @@ bool CSKYFrameLowering::spillCalleeSavedRegisters( MachineFunction *MF = MBB.getParent(); const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL; if (MI != MBB.end() && !MI->isDebugInstr()) DL = MI->getDebugLoc(); @@ -475,7 +476,7 @@ bool CSKYFrameLowering::spillCalleeSavedRegisters( for (auto &CS : CSI) { // Insert the spill to the stack frame.
Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, TRI, Register()); } @@ -491,13 +492,14 @@ bool CSKYFrameLowering::restoreCalleeSavedRegisters( MachineFunction *MF = MBB.getParent(); const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL; if (MI != MBB.end() && !MI->isDebugInstr()) DL = MI->getDebugLoc(); for (auto &CS : reverse(CSI)) { Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, Register()); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); diff --git a/llvm/lib/Target/Hexagon/BitTracker.cpp b/llvm/lib/Target/Hexagon/BitTracker.cpp index 4d5789a3c5fe1..855e64b4b81c2 100644 --- a/llvm/lib/Target/Hexagon/BitTracker.cpp +++ b/llvm/lib/Target/Hexagon/BitTracker.cpp @@ -712,7 +712,7 @@ BT::BitMask BT::MachineEvaluator::mask(Register Reg, unsigned Sub) const { } uint16_t BT::MachineEvaluator::getPhysRegBitWidth(MCRegister Reg) const { - const TargetRegisterClass &PC = *TRI.getMinimalPhysRegClass(Reg); + const TargetRegisterClass &PC = *TRI.getMinimalPhysRegClass(Reg, MRI); return TRI.getRegSizeInBits(PC); } diff --git a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp index a027f2cedca0a..d44ea12d154b9 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -120,7 +120,8 @@ uint16_t HexagonEvaluator::getPhysRegBitWidth(MCRegister Reg) const { return TRI.getRegSizeInBits(RC); } // Default treatment for other physical registers.
- if (const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg)) + if (const TargetRegisterClass *RC = + TRI.getMinimalPhysRegClass(Reg, MF.getRegInfo())) return TRI.getRegSizeInBits(*RC); llvm_unreachable( diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp index e1005296d6375..e71cc55766a5f 100644 --- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -612,7 +612,7 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO, PhysR = RS.Reg; } MCRegister PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS, *MRI); switch (TRI->getRegSizeInBits(*RC)) { case 32: return IfTrue ? Hexagon::A2_tfrt : Hexagon::A2_tfrf; diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 232651132d6e4..b1ac44a06418c 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1411,6 +1411,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, return true; } + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (const CalleeSavedInfo &I : CSI) { Register Reg = I.getReg(); // Add live in registers. We treat eh_return callee saved register r0 - r3 @@ -1418,7 +1419,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, // supposed to be killed. 
bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); int FI = I.getFrameIdx(); - const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg, MRI); HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI, Register()); if (IsKill) MBB.addLiveIn(Reg); @@ -1480,9 +1481,10 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, return true; } + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (const CalleeSavedInfo &I : CSI) { Register Reg = I.getReg(); - const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg, MRI); int FI = I.getFrameIdx(); HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI, Register()); } @@ -1560,6 +1562,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector &CSI) const { LLVM_DEBUG(dbgs() << __func__ << " on " << MF.getName() << '\n'); MachineFrameInfo &MFI = MF.getFrameInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); BitVector SRegs(Hexagon::NUM_TARGET_REGS); // Generate a set of unique, callee-saved registers (SRegs), where each @@ -1665,7 +1668,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, for (const SpillSlot *S = FixedSlots; S != FixedSlots+NumFixed; ++S) { if (!SRegs[S->Reg]) continue; - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(S->Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(S->Reg, MRI); int FI = MFI.CreateFixedSpillStackObject(TRI->getSpillSize(*RC), S->Offset); MinOffset = std::min(MinOffset, S->Offset); CSI.push_back(CalleeSavedInfo(S->Reg, FI)); @@ -1677,7 +1680,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, // such register, create a non-fixed stack object. 
for (int x = SRegs.find_first(); x >= 0; x = SRegs.find_next(x)) { Register R = x; - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R, MRI); unsigned Size = TRI->getSpillSize(*RC); int Off = MinOffset - Size; Align Alignment = std::min(TRI->getSpillAlign(*RC), getStackAlign()); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index b9bf26ba7cca1..20bd79352bae1 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1730,12 +1730,14 @@ bool HexagonInstrInfo::ClobbersPredicate(MachineInstr &MI, std::vector &Pred, bool SkipDead) const { const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (const MachineOperand &MO : MI.operands()) { if (MO.isReg()) { if (!MO.isDef()) continue; - const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg()); + const TargetRegisterClass *RC = + HRI.getMinimalPhysRegClass(MO.getReg(), MRI); if (RC == &Hexagon::PredRegsRegClass) { Pred.push_back(MO); return true; diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 56472d633694a..7c21a12f00db5 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -705,13 +705,14 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI, unsigned predRegNumSrc = 0; unsigned predRegNumDst = 0; const TargetRegisterClass* predRegClass = nullptr; + const MachineRegisterInfo &MRI = PacketMI.getMF()->getRegInfo(); // Get predicate register used in the source instruction. 
for (auto &MO : PacketMI.operands()) { if (!MO.isReg()) continue; predRegNumSrc = MO.getReg(); - predRegClass = HRI->getMinimalPhysRegClass(predRegNumSrc); + predRegClass = HRI->getMinimalPhysRegClass(predRegNumSrc, MRI); if (predRegClass == &Hexagon::PredRegsRegClass) break; } @@ -723,7 +724,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI, if (!MO.isReg()) continue; predRegNumDst = MO.getReg(); - predRegClass = HRI->getMinimalPhysRegClass(predRegNumDst); + predRegClass = HRI->getMinimalPhysRegClass(predRegNumDst, MRI); if (predRegClass == &Hexagon::PredRegsRegClass) break; } @@ -1406,6 +1407,8 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { if (!SUJ->isSucc(SUI)) return true; + const MachineRegisterInfo &MRI = I.getMF()->getRegInfo(); + for (unsigned i = 0; i < SUJ->Succs.size(); ++i) { if (FoundSequentialDependence) break; @@ -1433,7 +1436,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { const TargetRegisterClass *RC = nullptr; if (DepType == SDep::Data) { DepReg = SUJ->Succs[i].getReg(); - RC = HRI->getMinimalPhysRegClass(DepReg); + RC = HRI->getMinimalPhysRegClass(DepReg, MRI); } if (I.isCall() || HII->isJumpR(I) || I.isReturn() || HII->isTailCall(I)) { diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index b26821cd01718..1f2d2ca88472f 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -48,8 +48,9 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) { assert(Register::isPhysicalRegister(DstR.Reg)); assert(Register::isPhysicalRegister(SrcR.Reg)); const TargetRegisterInfo &TRI = DFG.getTRI(); - if (TRI.getMinimalPhysRegClass(DstR.Reg) != - TRI.getMinimalPhysRegClass(SrcR.Reg)) + const MachineRegisterInfo &MRI = DFG.getMF().getRegInfo(); + if (TRI.getMinimalPhysRegClass(DstR.Reg, MRI) != + TRI.getMinimalPhysRegClass(SrcR.Reg, MRI)) return false; if 
(!DFG.isTracked(SrcR) || !DFG.isTracked(DstR)) return false; @@ -158,7 +159,8 @@ bool CopyPropagation::run() { auto MinPhysReg = [this] (RegisterRef RR) -> unsigned { const TargetRegisterInfo &TRI = DFG.getTRI(); - const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg); + const TargetRegisterClass &RC = + *TRI.getMinimalPhysRegClass(RR.Reg, DFG.getMF().getRegInfo()); if ((RC.LaneMask & RR.Mask) == RC.LaneMask) return RR.Reg; for (MCSubRegIndexIterator S(RR.Reg, &TRI); S.isValid(); ++S) diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index dc2d61a6e4740..063ae2490417e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -463,7 +463,8 @@ bool LoongArchFrameLowering::spillCalleeSavedRegisters( // LoongArchTargetLowering::lowerRETURNADDR, don't set kill flag. bool IsKill = !(Reg == LoongArch::R1 && MF->getFrameInfo().isReturnAddressTaken()); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(Reg, MF->getRegInfo()); TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC, TRI, Register()); } diff --git a/llvm/lib/Target/Mips/MipsFrameLowering.cpp b/llvm/lib/Target/Mips/MipsFrameLowering.cpp index 99d225f9abfe8..b152d70f7970b 100644 --- a/llvm/lib/Target/Mips/MipsFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsFrameLowering.cpp @@ -114,6 +114,7 @@ bool MipsFrameLowering::hasBP(const MachineFunction &MF) const { uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); int64_t Size = 0; @@ -124,7 +125,7 @@ uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const { // Conservatively assume all callee-saved registers will be 
saved. for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) { - unsigned RegSize = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R)); + unsigned RegSize = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R, MRI)); Size = alignTo(Size + RegSize, RegSize); } diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index 38f6889a52358..a923d21de8fcf 100644 --- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -259,7 +259,8 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, // copy dst_hi, $vr1 unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg(); - const TargetRegisterClass *DstRC = RegInfo.getMinimalPhysRegClass(Dst); + const TargetRegisterClass *DstRC = + RegInfo.getMinimalPhysRegClass(Dst, MBB.getParent()->getRegInfo()); unsigned VRegSize = RegInfo.getRegSizeInBits(*DstRC) / 16; const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize); Register VR0 = MRI.createVirtualRegister(RC); @@ -796,6 +797,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( ArrayRef CSI, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); const TargetInstrInfo &TII = *STI.getInstrInfo(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); for (const CalleeSavedInfo &I : CSI) { // Add the callee-saved register as live-in. Do not add if the register is @@ -831,7 +833,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( // Insert the spill to the stack frame. 
bool IsKill = !IsRAAndRetAddrIsTaken; - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, TRI, Register()); } diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 6dcb59a3a57f8..e697920f94854 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2502,7 +2502,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( } Spilled.set(Dst); } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); // Use !IsLiveIn for the kill flag. // We do not want to kill registers that are live in this function // before their use because they will become undefined registers. @@ -2605,6 +2605,7 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( bool CR4Spilled = false; unsigned CSIIndex = 0; BitVector Restored(TRI->getNumRegs()); + const MachineRegisterInfo &MRI = MF->getRegInfo(); // Initialize insertion-point logic; we will be restoring in reverse // order of spill. @@ -2677,7 +2678,7 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( } else { // Default behavior for non-CR saves. - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); // Functions without NoUnwind need to preserve the order of elements in // saved vector registers. diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 5f5eb31a5a85f..fe8df2a601233 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1526,8 +1526,8 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, // Check register classes. 
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *RC = - RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + const TargetRegisterClass *RC = RI.getCommonSubClass( + MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg), MRI); if (!RC) return false; @@ -1559,8 +1559,8 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, // Get the register classes. MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *RC = - RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + const TargetRegisterClass *RC = RI.getCommonSubClass( + MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg), MRI); assert(RC && "TrueReg and FalseReg must have overlapping register classes"); bool Is64Bit = PPC::G8RCRegClass.hasSubClassEq(RC) || diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 0f450a4bf9692..74073f2f07f48 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -456,6 +456,7 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co const PPCInstrInfo *InstrInfo = Subtarget.getInstrInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); const std::vector &Info = MFI.getCalleeSavedInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); LLVM_DEBUG(dbgs() << "requiresFrameIndexScavenging for " << MF.getName() << ".\n"); @@ -486,7 +487,7 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co int FrIdx = CSI.getFrameIdx(); Register Reg = CSI.getReg(); - const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg, MRI); unsigned Opcode = InstrInfo->getStoreOpcodeForSpill(RC); if (!MFI.isFixedObjectIndex(FrIdx)) { // This is not a fixed object. 
If it requires alignment then we may still diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 8bac41372b5a8..8010da5dc5634 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1391,10 +1391,11 @@ bool RISCVFrameLowering::assignCalleeSavedSpillSlots( MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (auto &CS : CSI) { unsigned Reg = CS.getReg(); - const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg, MRI); unsigned Size = RegInfo->getSpillSize(*RC); // This might need a fixed stack slot. @@ -1483,10 +1484,11 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( // Manually spill values not spilled by libcall & Push/Pop. const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI); + const MachineRegisterInfo &MRI = MF->getRegInfo(); for (auto &CS : UnmanagedCSI) { // Insert the spill to the stack frame. Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(), RC, TRI, Register()); } @@ -1513,9 +1515,10 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( // load-to-use data hazard between loading RA and return by RA. // loadRegFromStackSlot can insert multiple instructions. 
const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI); + const MachineRegisterInfo &MRI = MF->getRegInfo(); for (auto &CS : UnmanagedCSI) { Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, Register()); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 10bf1e88d7414..f6bf254cd81c5 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -555,7 +555,7 @@ bool RISCVRegisterInfo::needsFrameBaseReg(MachineInstr *MI, BitVector ReservedRegs = getReservedRegs(MF); for (const MCPhysReg *R = MRI.getCalleeSavedRegs(); MCPhysReg Reg = *R; ++R) { if (!ReservedRegs.test(Reg)) - CalleeSavedSize += getSpillSize(*getMinimalPhysRegClass(Reg)); + CalleeSavedSize += getSpillSize(*getMinimalPhysRegClass(Reg, MRI)); } int64_t MaxFPOffset = Offset - CalleeSavedSize; diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 80c994a32ea96..a1b023618aa36 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -185,6 +185,7 @@ bool SystemZELFFrameLowering::assignCalleeSavedSpillSlots( unsigned LowGPR = 0; unsigned HighGPR = SystemZ::R15D; int StartSPOffset = SystemZMC::ELFCallFrameSize; + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (auto &CS : CSI) { Register Reg = CS.getReg(); int Offset = getRegSpillOffset(MF, Reg); @@ -227,7 +228,7 @@ bool SystemZELFFrameLowering::assignCalleeSavedSpillSlots( if (CS.getFrameIdx() != INT32_MAX) continue; Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = 
TRI->getMinimalPhysRegClass(Reg, MRI); unsigned Size = TRI->getSpillSize(*RC); CurrOffset -= Size; assert(CurrOffset % 8 == 0 && @@ -1009,6 +1010,7 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( int LowSpillOffset = INT32_MAX; Register HighGPR = 0; int HighOffset = -1; + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (auto &CS : CSI) { Register Reg = CS.getReg(); @@ -1038,7 +1040,7 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( } } else { Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); Align Alignment = TRI->getSpillAlign(*RC); unsigned Size = TRI->getSpillSize(*RC); Alignment = std::min(Alignment, getStackAlign()); diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 2a6dce863c28f..d4dae31f888be 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -548,8 +548,8 @@ bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, // Check register classes. 
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *RC = - RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + const TargetRegisterClass *RC = RI.getCommonSubClass( + MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg), MRI); if (!RC) return false; diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp index d246d3f3c5bd1..2e6bc82d4849e 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -146,12 +146,11 @@ bool SystemZRegisterInfo::getRegAllocationHints( Use.getOpcode() == SystemZ::SELRMux) { MachineOperand &TrueMO = Use.getOperand(1); MachineOperand &FalseMO = Use.getOperand(2); - const TargetRegisterClass *RC = - TRI->getCommonSubClass(getRC32(FalseMO, VRM, MRI), - getRC32(TrueMO, VRM, MRI)); + const TargetRegisterClass *RC = TRI->getCommonSubClass( + getRC32(FalseMO, VRM, MRI), getRC32(TrueMO, VRM, MRI), *MRI); if (Use.getOpcode() == SystemZ::SELRMux) - RC = TRI->getCommonSubClass(RC, - getRC32(Use.getOperand(0), VRM, MRI)); + RC = TRI->getCommonSubClass( + RC, getRC32(Use.getOperand(0), VRM, MRI), *MRI); if (RC && RC != &SystemZ::GRX32BitRegClass) { addHints(Order, Hints, RC, MRI); // Return true to make these hints the only regs available to diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 32a4accd040eb..614ecca890827 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -64,7 +64,7 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const TargetRegisterClass *RC = Register::isVirtualRegister(DestReg) ? 
MRI.getRegClass(DestReg) - : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg); + : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg, MRI); unsigned CopyOpcode = WebAssembly::getCopyOpcodeForRegClass(RC); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp index 1e2bee7a5c73b..51a1997a52d36 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp @@ -80,7 +80,7 @@ bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) { continue; // Replace explicit uses of the physical register with a virtual register. - const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PReg); + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PReg, MRI); unsigned VReg = WebAssembly::NoRegister; for (MachineOperand &MO : llvm::make_early_inc_range(MRI.reg_operands(PReg))) { diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index d914e1b61ab07..0e0e30877fe1d 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2882,6 +2882,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( } } + const MachineRegisterInfo &MRI = MF.getRegInfo(); // Assign slots for GPRs. It increases frame size. for (CalleeSavedInfo &I : llvm::reverse(CSI)) { Register Reg = I.getReg(); @@ -2931,7 +2932,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( if (X86::VK16RegClass.contains(Reg)) VT = STI.hasBWI() ? 
MVT::v64i1 : MVT::v16i1; - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI, VT); unsigned Size = TRI->getSpillSize(*RC); Align Alignment = TRI->getSpillAlign(*RC); // ensure alignment @@ -3018,6 +3019,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( .setMIFlag(MachineInstr::FrameSetup); } + const MachineRegisterInfo &MRI = MF.getRegInfo(); // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { @@ -3032,7 +3034,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI, VT); TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI, Register()); @@ -3097,6 +3099,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( } DebugLoc DL = MBB.findDebugLoc(MI); + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // Reload XMMs from stack frame. for (const CalleeSavedInfo &I : CSI) { @@ -3109,7 +3112,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( if (X86::VK16RegClass.contains(Reg)) VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI, VT); TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI, Register()); } diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index eb42a4b2119d5..dc2b6514dcfcd 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4028,8 +4028,8 @@ bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, // Check register classes. 
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *RC = - RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + const TargetRegisterClass *RC = RI.getCommonSubClass( + MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg), MRI); if (!RC) return false; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index be0cf1596d0d9..32c4ee88459d9 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -93,17 +93,16 @@ X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC, return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx); } -const TargetRegisterClass * -X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, - const TargetRegisterClass *B, - unsigned SubIdx) const { +const TargetRegisterClass *X86RegisterInfo::getMatchingSuperRegClass( + const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned SubIdx, + const MachineRegisterInfo &MRI) const { // The sub_8bit sub-register index is more constrained in 32-bit mode. if (!Is64Bit && SubIdx == X86::sub_8bit) { A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi); if (!A) return nullptr; } - return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); + return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx, MRI); } const TargetRegisterClass * @@ -218,10 +217,10 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, } } -bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const { +bool X86RegisterInfo::shouldRewriteCopySrc( + const TargetRegisterClass *DefRC, unsigned DefSubReg, + const TargetRegisterClass *SrcRC, unsigned SrcSubReg, + const MachineRegisterInfo &MRI) const { // Prevent rewriting a copy where the destination size is larger than the // input size. See PR41619. 
// FIXME: Should this be factored into the base implementation somehow. @@ -229,8 +228,8 @@ bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, SrcRC->hasSuperClassEq(&X86::GR64RegClass) && SrcSubReg == X86::sub_32bit) return false; - return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg, - SrcRC, SrcSubReg); + return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg, SrcRC, + SrcSubReg, MRI); } const TargetRegisterClass * diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index 7296a5f021e4a..e61defefbdf3f 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -62,8 +62,8 @@ class X86RegisterInfo final : public X86GenRegisterInfo { /// specified sub-register index which is in the specified register class B. const TargetRegisterClass * getMatchingSuperRegClass(const TargetRegisterClass *A, - const TargetRegisterClass *B, - unsigned Idx) const override; + const TargetRegisterClass *B, unsigned Idx, + const MachineRegisterInfo &MRI) const override; const TargetRegisterClass * getSubClassWithSubReg(const TargetRegisterClass *RC, @@ -76,7 +76,8 @@ class X86RegisterInfo final : public X86GenRegisterInfo { bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, unsigned DefSubReg, const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const override; + unsigned SrcSubReg, + const MachineRegisterInfo &MRI) const override; /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. 
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index 8cb9413f96526..ea05ea2078c7b 100644 --- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -422,6 +422,7 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters( const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); XCoreFunctionInfo *XFI = MF->getInfo(); bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF); + const MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL; if (MI != MBB.end() && !MI->isDebugInstr()) @@ -434,7 +435,7 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters( // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI, Register()); if (emitFrameMoves) { @@ -453,6 +454,7 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters( const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); bool AtStart = MI == MBB.begin(); MachineBasicBlock::iterator BeforeI = MI; + const MachineRegisterInfo &MRI = MF->getRegInfo(); if (!AtStart) --BeforeI; for (const CalleeSavedInfo &CSR : CSI) { @@ -460,7 +462,7 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters( assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) && "LR & FP are always handled in emitEpilogue"); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MRI); TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, TRI, Register()); assert(MI != MBB.begin() && diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index 10cbc56cc5fbe..f70a4c27a1d7a 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -15,49 +15,43 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; CHECK-NEXT: v_mov_b32_e32 v8, v0 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v15, v1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v14, v2 -; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v13, v3 -; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v12, v4 -; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v11, v5 -; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v10, v6 -; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v9, v7 -; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; CHECK-NEXT: v_mov_b32_e32 v2, v15 -; CHECK-NEXT: v_mov_b32_e32 v3, v14 -; CHECK-NEXT: v_mov_b32_e32 v4, v13 -; CHECK-NEXT: v_mov_b32_e32 v5, v12 -; CHECK-NEXT: 
v_mov_b32_e32 v6, v11 -; CHECK-NEXT: v_mov_b32_e32 v7, v10 -; CHECK-NEXT: v_mov_b32_e32 v8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v14, v1 +; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v13, v2 +; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v12, v3 +; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v11, v4 +; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v10, v5 +; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v9, v6 +; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v8, v7 +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v1, v14 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 
v3, v12 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v5, v10 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v7, v8 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s4, s8 ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 +; CHECK-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v0, s4, 0 ; CHECK-NEXT: v_writelane_b32 v0, s5, 1 ; CHECK-NEXT: v_writelane_b32 v0, s6, 2 @@ -177,15 +171,11 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: v_readlane_b32 s4, v0, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 -; CHECK-NEXT: ; kill: killed $vgpr4 ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index 326df0750cfbd..ace653d6dc029 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -106,7 +106,7 @@ entry: define i32 @test_single_sgpr_output_s32() nounwind { ; CHECK-LABEL: name: test_single_sgpr_output_s32 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %7 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %7 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %7, 3538954 /* regdef:VReg_64 */, def %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %7, 3604490 /* regdef:VReg_64 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %8 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) @@ -185,7 +185,7 @@ define amdgpu_kernel void @test_input_sgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, 
[[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42) ret void @@ -256,13 +256,13 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { define i32 @test_sgpr_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %7 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %7 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %9 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %9 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %11, 2359305 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %11, 2424841 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -306,7 +306,7 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %7 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %7 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 ; 
CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll index 3ed2cb856eaea..8d2274fd329f4 100644 --- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll @@ -8,19 +8,17 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { ; REGALLOC-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; REGALLOC-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; REGALLOC-NEXT: {{ $}} - ; REGALLOC-NEXT: renamable $vgpr3 = IMPLICIT_DEF ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr2, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) - ; REGALLOC-NEXT: renamable $vgpr1 = COPY $vgpr0 - ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 49 - ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GT_I32_e64 killed $vgpr1, killed $sgpr4, implicit $exec + ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GT_I32_e64 killed $vgpr0, killed $sgpr4, implicit $exec ; REGALLOC-NEXT: renamable $sgpr6 = IMPLICIT_DEF - ; REGALLOC-NEXT: renamable $vgpr1 = COPY killed renamable $sgpr6 - ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; REGALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr6 + ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = COPY 
$exec, implicit-def $exec ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = S_XOR_B64 renamable $sgpr4_sgpr5, killed renamable $sgpr6_sgpr7, implicit-def dead $scc + ; REGALLOC-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr6, 0, $vgpr0, implicit-def $sgpr6_sgpr7, implicit $sgpr6_sgpr7 ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 1, $vgpr0, implicit killed $sgpr6_sgpr7 ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) @@ -66,11 +64,10 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { ; REGALLOC-NEXT: bb.4.bb.3: ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5 - ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3 + ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr1, 3 ; REGALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) ; REGALLOC-NEXT: renamable $vgpr0 = V_LSHL_ADD_U32_e64 killed $vgpr0, 2, $vgpr0, implicit $exec - ; REGALLOC-NEXT: KILL killed renamable $vgpr1 ; REGALLOC-NEXT: SI_RETURN implicit killed $vgpr0 bb.0: %cmp = icmp slt i32 %arg0, 50 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 3c8ea61b0d43b..afab70aafe49f 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -36,8 +36,8 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; 
GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_load_dword s1, s[2:3], 0xa @@ -52,11 +52,6 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -91,11 +86,6 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm entry: %cmp = icmp eq i32 %n, -1 @@ -144,8 +134,8 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: 
s_mov_b32 s0, 0 @@ -155,11 +145,6 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -232,8 +217,8 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 @@ -243,11 +228,6 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -321,8 +301,8 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; 
GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 @@ -332,11 +312,6 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -422,8 +397,8 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0 @@ -445,11 +420,6 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB4_2 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB4_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 6bc8d29b3bf7c..8e3b958f09df9 100644 --- 
a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -48,13 +48,10 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 @@ -145,11 +142,10 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -222,13 +218,10 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 @@ -340,14 +333,10 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB1_3 ; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -433,15 +422,11 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 @@ -577,11 +562,10 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_readlane_b32 
s0, v0, 2 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -681,47 +665,43 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s0, v1 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s0, v0 ; GCN-O0-NEXT: s_mov_b32 s1, 0 ; GCN-O0-NEXT: ; implicit-def: $sgpr1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 -; GCN-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v4, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v1 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 ; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v4 -; GCN-O0-NEXT: 
v_add_i32_e64 v5, s[2:3], s2, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 -; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3] -; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v6, v2 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v5, v3 +; GCN-O0-NEXT: v_add_i32_e64 v4, s[2:3], s2, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: v_addc_u32_e64 v1, s[2:3], v1, v5, s[2:3] +; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v5, v1 +; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b32 s3, s1 ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 -; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 +; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[2:3], exec ; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 @@ -867,11 +847,10 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 ; GCN-O0-NEXT: 
v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v1 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -935,13 +914,10 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s9 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 @@ -990,7 +966,6 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_barrier -; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1085,16 +1060,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 
-; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GCN-O0-NEXT: v_writelane_b32 v0, s4, 2 @@ -1359,41 +1329,38 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: v_readlane_b32 s5, v0, 19 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: ; %bb.11: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v3 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v5, v6, 
s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v2 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] bb: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 2f3d5d9d140c2..c034b54ba0b3c 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -224,102 +224,98 @@ 
define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v6, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_xor_b32_e64 v8, v5, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr9_vgpr10 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v13 -; GFX9-O0-NEXT: v_xor_b32_e64 v13, v4, v5 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v4, v9 ; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed 
$exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v6, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_xor_b32_e64 v8, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v4, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v8, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v0, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 ; GFX9-O0-NEXT: v_xor_b32_e64 v11, v3, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-O0-NEXT: v_xor_b32_e64 v7, v3, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v3, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v3, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v0, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v3, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v0, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec @@ -327,11 +323,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-O0-NEXT: v_xor_b32_e64 v5, 
v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v5 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -354,13 +350,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v8, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 ; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] @@ -1162,58 +1159,54 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 -; GFX9-O0-NEXT: v_xor_b32_e64 v9, v6, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v8, v5, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, 
v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: s_mov_b32 s4, 32 -; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 @@ -1414,25 +1407,25 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; 
GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec @@ -1471,13 +1464,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v8, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 ; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] @@ -1674,27 +1668,27 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload 
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(6) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 
; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 @@ -1709,22 +1703,22 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_9 ; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], 
s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -1764,27 +1758,27 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1794,30 +1788,30 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 ; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; 
GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] @@ -1957,24 +1951,24 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte 
Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 ; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 @@ -1984,42 +1978,42 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte 
Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 ; GFX9-O0-NEXT: s_branch .LBB1_1 ; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 
offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2103,12 +2097,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 @@ -2119,30 +2113,30 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 @@ -2184,14 +2178,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 
offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 ; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] @@ -2237,12 +2231,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 @@ -2257,18 +2251,18 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] @@ -2281,28 +2275,24 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 ; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte 
Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[7:8] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[5:6] +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir index 3bf7e7b8c5696..04ddebd3aa414 100644 --- 
a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir +++ b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir @@ -28,10 +28,9 @@ body: | ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, killed $vgpr0 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec - ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr0, 0, 0, implicit $exec ; GCN-NEXT: SI_RETURN SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_NOP 0 @@ -63,32 +62,31 @@ body: | ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr6, 0, killed $vgpr0 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec + ; 
GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 20, implicit $exec ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $vgpr0, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: KILL killed renamable $vgpr0 ; GCN-NEXT: SI_RETURN bb.0: liveins: $sgpr6, $sgpr10_sgpr11 @@ -135,52 +133,51 @@ body: | ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GCN-NEXT: liveins: $sgpr4, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr4, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, killed $vgpr0 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 20, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr1, $sgpr10_sgpr11 + ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr5 = 
V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 5, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 5, implicit $exec ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) - ; GCN-NEXT: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: liveins: $vgpr0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $vcc = V_CMP_EQ_U32_e64 0, $vgpr1, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U32_e64 0, $vgpr0, implicit $exec ; GCN-NEXT: $sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GCN-NEXT: S_CBRANCH_SCC1 %bb.5, implicit $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr6_sgpr7 + ; GCN-NEXT: liveins: $vgpr0, $sgpr6_sgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_SUB_U32_e32 1, killed $vgpr1, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_SUB_U32_e32 1, killed $vgpr0, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 killed $vgpr0, implicit $exec ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: - ; GCN-NEXT: liveins: $vgpr0, $sgpr6_sgpr7 + ; GCN-NEXT: liveins: $sgpr6_sgpr7 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc - ; GCN-NEXT: KILL killed renamable $vgpr0 ; GCN-NEXT: SI_RETURN bb.0: liveins: $sgpr4, $sgpr10_sgpr11 @@ -239,26 +236,24 @@ body: | ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GCN-NEXT: liveins: $sgpr4, $vgpr2_vgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr2_vgpr3 + ; GCN-NEXT: liveins: $sgpr4, 
$vgpr2_vgpr3 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, killed $vgpr0 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec - ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr0, 0, 0, implicit $exec ; GCN-NEXT: SI_RETURN ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: liveins: $vgpr0, $vgpr2_vgpr3 + ; GCN-NEXT: liveins: $vgpr2_vgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec - ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr0, 0, 0, implicit $exec ; GCN-NEXT: SI_RETURN bb.0: liveins: $sgpr4, $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index b9583a73295e2..663f682fd8858 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -228,11 +228,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_OPT-NEXT: ; kill: killed $vgpr1 -; FLAT_SCR_OPT-NEXT: global_store_dword v2, v0, s[0:1] +; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, 0 +; 
FLAT_SCR_OPT-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_OPT-NEXT: s_endpgm ; ; FLAT_SCR_ARCH-LABEL: test: @@ -351,11 +348,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 ; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1 -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_ARCH-NEXT: ; kill: killed $vgpr1 -; FLAT_SCR_ARCH-NEXT: global_store_dword v2, v0, s[0:1] +; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, 0 +; FLAT_SCR_ARCH-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_ARCH-NEXT: s_endpgm call void asm sideeffect "", "~{s[0:7]}" () call void asm sideeffect "", "~{s[8:15]}" () diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir index ba619a659f1b0..f8c3629764ea7 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -12,9 +12,9 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo + ; CHECK: S_NOP 0, implicit-def $exec_lo ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 @@ -37,9 +37,9 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi + ; CHECK: S_NOP 0, implicit-def $exec_hi ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF 
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 @@ -62,9 +62,9 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def $exec + ; CHECK: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 @@ -93,8 +93,8 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 @@ -116,8 +116,8 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi + ; CHECK-NEXT: renamable $vgpr0 = 
IMPLICIT_DEF ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 @@ -139,8 +139,8 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir index 1c7896fcb4f14..5732e43b3c423 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir @@ -13,9 +13,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0 - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit-def $m0 + ; CHECK: S_NOP 0, implicit-def $m0 ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 
297b5180dfe9b..04fc4218d2964 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,209 +6,209 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v8, s30, 0 -; CHECK-NEXT: v_writelane_b32 v8, s31, 1 -; CHECK-NEXT: v_writelane_b32 v8, s36, 2 -; CHECK-NEXT: v_writelane_b32 v8, s37, 3 -; CHECK-NEXT: v_writelane_b32 v8, s38, 4 -; CHECK-NEXT: v_writelane_b32 v8, s39, 5 -; CHECK-NEXT: v_writelane_b32 v8, s40, 6 -; CHECK-NEXT: v_writelane_b32 v8, s41, 7 -; CHECK-NEXT: v_writelane_b32 v8, s42, 8 -; CHECK-NEXT: v_writelane_b32 v8, s43, 9 -; CHECK-NEXT: v_writelane_b32 v8, s44, 10 -; CHECK-NEXT: v_writelane_b32 v8, s45, 11 -; CHECK-NEXT: v_writelane_b32 v8, s46, 12 -; CHECK-NEXT: v_writelane_b32 v8, s47, 13 -; CHECK-NEXT: v_writelane_b32 v8, s48, 14 -; CHECK-NEXT: v_writelane_b32 v8, s49, 15 +; CHECK-NEXT: v_writelane_b32 v7, s30, 0 +; CHECK-NEXT: v_writelane_b32 v7, s31, 1 +; CHECK-NEXT: v_writelane_b32 v7, s36, 2 +; CHECK-NEXT: v_writelane_b32 v7, s37, 3 +; CHECK-NEXT: v_writelane_b32 v7, s38, 4 +; CHECK-NEXT: v_writelane_b32 v7, s39, 5 +; CHECK-NEXT: v_writelane_b32 v7, s40, 6 +; CHECK-NEXT: v_writelane_b32 v7, s41, 7 +; CHECK-NEXT: v_writelane_b32 v7, s42, 8 +; CHECK-NEXT: v_writelane_b32 v7, s43, 9 +; CHECK-NEXT: 
v_writelane_b32 v7, s44, 10 +; CHECK-NEXT: v_writelane_b32 v7, s45, 11 +; CHECK-NEXT: v_writelane_b32 v7, s46, 12 +; CHECK-NEXT: v_writelane_b32 v7, s47, 13 +; CHECK-NEXT: v_writelane_b32 v7, s48, 14 +; CHECK-NEXT: v_writelane_b32 v7, s49, 15 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v8, s50, 16 +; CHECK-NEXT: v_writelane_b32 v7, s50, 16 ; CHECK-NEXT: s_movk_i32 s4, 0xf0 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v8, s51, 17 +; CHECK-NEXT: v_writelane_b32 v7, s51, 17 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane +; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0 ; CHECK-NEXT: s_movk_i32 s4, 0x130 ; CHECK-NEXT: s_mov_b32 s5, s24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s36, 0 -; CHECK-NEXT: v_writelane_b32 v4, s37, 1 -; CHECK-NEXT: v_writelane_b32 v4, s38, 2 -; CHECK-NEXT: v_writelane_b32 v4, s39, 3 -; CHECK-NEXT: v_writelane_b32 v4, s40, 4 -; CHECK-NEXT: v_writelane_b32 v4, s41, 5 -; CHECK-NEXT: v_writelane_b32 v4, s42, 6 -; CHECK-NEXT: v_writelane_b32 v4, s43, 7 -; CHECK-NEXT: v_writelane_b32 v4, s44, 8 -; CHECK-NEXT: v_writelane_b32 v4, s45, 9 -; CHECK-NEXT: v_writelane_b32 v4, s46, 10 +; CHECK-NEXT: v_writelane_b32 v3, s36, 0 +; CHECK-NEXT: v_writelane_b32 v3, s37, 1 +; CHECK-NEXT: v_writelane_b32 v3, s38, 2 +; CHECK-NEXT: v_writelane_b32 v3, s39, 3 +; CHECK-NEXT: v_writelane_b32 v3, s40, 4 +; CHECK-NEXT: v_writelane_b32 v3, s41, 5 +; CHECK-NEXT: v_writelane_b32 v3, s42, 6 +; CHECK-NEXT: v_writelane_b32 v3, s43, 7 +; CHECK-NEXT: v_writelane_b32 v3, s44, 8 +; CHECK-NEXT: v_writelane_b32 v3, s45, 9 +; CHECK-NEXT: v_writelane_b32 v3, s46, 10 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v4, s47, 11 -; CHECK-NEXT: v_writelane_b32 v4, s48, 12 -; CHECK-NEXT: v_writelane_b32 v4, s49, 13 +; 
CHECK-NEXT: v_writelane_b32 v3, s47, 11 +; CHECK-NEXT: v_writelane_b32 v3, s48, 12 +; CHECK-NEXT: v_writelane_b32 v3, s49, 13 ; CHECK-NEXT: s_mov_b32 s20, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_writelane_b32 v4, s50, 14 -; CHECK-NEXT: v_mov_b32_e32 v5, s28 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 +; CHECK-NEXT: v_writelane_b32 v3, s50, 14 +; CHECK-NEXT: v_mov_b32_e32 v4, s28 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_writelane_b32 v4, s51, 15 +; CHECK-NEXT: v_writelane_b32 v3, s51, 15 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 -; CHECK-NEXT: image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[4:5], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 16 -; CHECK-NEXT: v_writelane_b32 v4, s5, 17 -; CHECK-NEXT: v_writelane_b32 v4, s6, 18 -; CHECK-NEXT: v_writelane_b32 v4, s7, 19 -; CHECK-NEXT: v_writelane_b32 v4, s8, 20 -; CHECK-NEXT: v_writelane_b32 v4, s9, 21 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v4, s10, 22 -; CHECK-NEXT: v_writelane_b32 v4, s11, 23 -; CHECK-NEXT: v_writelane_b32 v4, s12, 24 -; CHECK-NEXT: v_writelane_b32 v4, s13, 25 -; CHECK-NEXT: v_writelane_b32 v4, s14, 26 -; CHECK-NEXT: v_writelane_b32 v4, s15, 27 -; CHECK-NEXT: v_writelane_b32 v4, s16, 28 -; CHECK-NEXT: v_writelane_b32 v8, s52, 18 -; CHECK-NEXT: v_writelane_b32 v4, s17, 29 -; CHECK-NEXT: v_writelane_b32 v8, s53, 19 -; CHECK-NEXT: v_writelane_b32 v4, s18, 30 -; CHECK-NEXT: v_writelane_b32 v8, s54, 20 -; CHECK-NEXT: v_writelane_b32 v4, s19, 31 +; CHECK-NEXT: v_writelane_b32 v3, s4, 16 +; CHECK-NEXT: v_writelane_b32 v3, s5, 17 +; CHECK-NEXT: v_writelane_b32 v3, s6, 18 +; CHECK-NEXT: v_writelane_b32 v3, s7, 19 +; CHECK-NEXT: v_writelane_b32 v3, s8, 20 +; CHECK-NEXT: v_writelane_b32 v3, s9, 21 +; CHECK-NEXT: image_sample_lz v5, v[1:2], 
s[4:11], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v3, s10, 22 +; CHECK-NEXT: v_writelane_b32 v3, s11, 23 +; CHECK-NEXT: v_writelane_b32 v3, s12, 24 +; CHECK-NEXT: v_writelane_b32 v3, s13, 25 +; CHECK-NEXT: v_writelane_b32 v3, s14, 26 +; CHECK-NEXT: v_writelane_b32 v3, s15, 27 +; CHECK-NEXT: v_writelane_b32 v3, s16, 28 +; CHECK-NEXT: v_writelane_b32 v7, s52, 18 +; CHECK-NEXT: v_writelane_b32 v3, s17, 29 +; CHECK-NEXT: v_writelane_b32 v7, s53, 19 +; CHECK-NEXT: v_writelane_b32 v3, s18, 30 +; CHECK-NEXT: v_writelane_b32 v7, s54, 20 +; CHECK-NEXT: v_writelane_b32 v3, s19, 31 ; CHECK-NEXT: s_mov_b32 s4, 48 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v8, s55, 21 +; CHECK-NEXT: v_writelane_b32 v7, s55, 21 ; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v8, s56, 22 -; CHECK-NEXT: v_writelane_b32 v8, s57, 23 -; CHECK-NEXT: v_writelane_b32 v8, s58, 24 -; CHECK-NEXT: v_writelane_b32 v8, s59, 25 -; CHECK-NEXT: v_writelane_b32 v8, s60, 26 +; CHECK-NEXT: v_writelane_b32 v7, s56, 22 +; CHECK-NEXT: v_writelane_b32 v7, s57, 23 +; CHECK-NEXT: v_writelane_b32 v7, s58, 24 +; CHECK-NEXT: v_writelane_b32 v7, s59, 25 +; CHECK-NEXT: v_writelane_b32 v7, s60, 26 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 32 -; CHECK-NEXT: v_writelane_b32 v8, s61, 27 -; CHECK-NEXT: v_writelane_b32 v4, s5, 33 -; CHECK-NEXT: v_writelane_b32 v8, s62, 28 -; CHECK-NEXT: v_writelane_b32 v4, s6, 34 -; CHECK-NEXT: v_writelane_b32 v8, s63, 29 -; CHECK-NEXT: v_writelane_b32 v4, s7, 35 -; CHECK-NEXT: v_writelane_b32 v8, s64, 30 -; CHECK-NEXT: v_writelane_b32 v4, s8, 36 -; CHECK-NEXT: v_writelane_b32 v8, s65, 31 -; CHECK-NEXT: v_writelane_b32 v4, s9, 37 -; CHECK-NEXT: v_writelane_b32 v8, s66, 32 +; CHECK-NEXT: v_writelane_b32 v3, s4, 32 +; CHECK-NEXT: v_writelane_b32 v7, s61, 27 +; CHECK-NEXT: v_writelane_b32 v3, s5, 33 +; CHECK-NEXT: v_writelane_b32 v7, s62, 28 +; CHECK-NEXT: v_writelane_b32 v3, s6, 34 +; CHECK-NEXT: 
v_writelane_b32 v7, s63, 29 +; CHECK-NEXT: v_writelane_b32 v3, s7, 35 +; CHECK-NEXT: v_writelane_b32 v7, s64, 30 +; CHECK-NEXT: v_writelane_b32 v3, s8, 36 +; CHECK-NEXT: v_writelane_b32 v7, s65, 31 +; CHECK-NEXT: v_writelane_b32 v3, s9, 37 +; CHECK-NEXT: v_writelane_b32 v7, s66, 32 ; CHECK-NEXT: s_movk_i32 s26, 0x1f0 ; CHECK-NEXT: s_movk_i32 s28, 0x2f0 ; CHECK-NEXT: s_mov_b32 s27, s24 ; CHECK-NEXT: s_mov_b32 s29, s24 -; CHECK-NEXT: v_writelane_b32 v4, s10, 38 -; CHECK-NEXT: v_writelane_b32 v8, s67, 33 -; CHECK-NEXT: v_writelane_b32 v4, s11, 39 +; CHECK-NEXT: v_writelane_b32 v3, s10, 38 +; CHECK-NEXT: v_writelane_b32 v7, s67, 33 +; CHECK-NEXT: v_writelane_b32 v3, s11, 39 ; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1 -; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v6, v5 +; CHECK-NEXT: v_mul_f32_e32 v0, v5, v4 ; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25] ; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 +; CHECK-NEXT: v_readlane_b32 s36, v3, 0 +; CHECK-NEXT: v_readlane_b32 s44, v3, 8 +; CHECK-NEXT: v_readlane_b32 s45, v3, 9 +; CHECK-NEXT: v_readlane_b32 s46, v3, 10 +; CHECK-NEXT: v_readlane_b32 s47, v3, 11 +; CHECK-NEXT: v_readlane_b32 s48, v3, 12 +; CHECK-NEXT: v_readlane_b32 s49, v3, 13 +; CHECK-NEXT: v_readlane_b32 s50, v3, 14 +; CHECK-NEXT: v_readlane_b32 s51, 
v3, 15 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s37, v3, 1 +; CHECK-NEXT: v_readlane_b32 s38, v3, 2 +; CHECK-NEXT: v_readlane_b32 s39, v3, 3 +; CHECK-NEXT: v_readlane_b32 s40, v3, 4 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 +; CHECK-NEXT: v_readlane_b32 s41, v3, 5 +; CHECK-NEXT: v_readlane_b32 s42, v3, 6 +; CHECK-NEXT: v_readlane_b32 s43, v3, 7 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 32 -; CHECK-NEXT: v_readlane_b32 s40, v4, 36 -; CHECK-NEXT: v_readlane_b32 s41, v4, 37 -; CHECK-NEXT: v_readlane_b32 s42, v4, 38 -; CHECK-NEXT: v_readlane_b32 s43, v4, 39 +; CHECK-NEXT: v_readlane_b32 s36, v3, 32 +; CHECK-NEXT: v_readlane_b32 s40, v3, 36 +; CHECK-NEXT: v_readlane_b32 s41, v3, 37 +; CHECK-NEXT: v_readlane_b32 s42, v3, 38 +; CHECK-NEXT: v_readlane_b32 s43, v3, 39 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_readlane_b32 s37, v4, 33 -; CHECK-NEXT: v_readlane_b32 s38, v4, 34 +; CHECK-NEXT: v_readlane_b32 s37, v3, 33 +; CHECK-NEXT: v_readlane_b32 s38, v3, 34 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s39, v4, 35 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[60:67], s[40:43] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s39, v3, 35 ; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v1, v6 +; CHECK-NEXT: 
v_sub_f32_e32 v1, v1, v5 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 -; CHECK-NEXT: v_mul_f32_e32 v1, v1, v5 +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v4 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s12, v4, 32 -; CHECK-NEXT: v_readlane_b32 s13, v4, 33 -; CHECK-NEXT: v_readlane_b32 s14, v4, 34 -; CHECK-NEXT: v_readlane_b32 s15, v4, 35 -; CHECK-NEXT: v_readlane_b32 s16, v4, 36 -; CHECK-NEXT: v_readlane_b32 s17, v4, 37 -; CHECK-NEXT: v_readlane_b32 s18, v4, 38 -; CHECK-NEXT: v_readlane_b32 s19, v4, 39 -; CHECK-NEXT: v_writelane_b32 v4, s4, 40 -; CHECK-NEXT: v_writelane_b32 v4, s5, 41 -; CHECK-NEXT: v_writelane_b32 v4, s6, 42 -; CHECK-NEXT: v_writelane_b32 v4, s7, 43 -; CHECK-NEXT: v_writelane_b32 v4, s8, 44 -; CHECK-NEXT: v_writelane_b32 v4, s9, 45 -; CHECK-NEXT: v_writelane_b32 v4, s10, 46 -; CHECK-NEXT: v_writelane_b32 v4, s11, 47 -; CHECK-NEXT: v_writelane_b32 v4, s12, 48 -; CHECK-NEXT: v_writelane_b32 v4, s13, 49 -; CHECK-NEXT: v_writelane_b32 v4, s14, 50 -; CHECK-NEXT: v_writelane_b32 v4, s15, 51 -; CHECK-NEXT: v_writelane_b32 v4, s16, 52 -; CHECK-NEXT: v_writelane_b32 v4, s17, 53 -; CHECK-NEXT: v_writelane_b32 v4, s18, 54 -; CHECK-NEXT: v_writelane_b32 v4, s19, 55 -; CHECK-NEXT: v_writelane_b32 v4, s52, 56 -; CHECK-NEXT: v_writelane_b32 v3, s60, 0 -; CHECK-NEXT: v_writelane_b32 v4, s53, 57 -; CHECK-NEXT: v_writelane_b32 v3, s61, 1 -; CHECK-NEXT: v_writelane_b32 v4, s54, 58 -; CHECK-NEXT: v_writelane_b32 v3, s62, 2 -; CHECK-NEXT: v_writelane_b32 v4, s55, 59 -; CHECK-NEXT: v_writelane_b32 v3, s63, 3 -; CHECK-NEXT: v_writelane_b32 v4, s56, 60 -; CHECK-NEXT: v_writelane_b32 v3, s64, 4 -; CHECK-NEXT: v_writelane_b32 v4, s57, 61 -; CHECK-NEXT: v_writelane_b32 v3, s65, 5 -; CHECK-NEXT: v_writelane_b32 v4, s58, 62 -; CHECK-NEXT: v_writelane_b32 v3, s66, 6 -; CHECK-NEXT: v_writelane_b32 v4, s59, 63 -; CHECK-NEXT: v_writelane_b32 v3, s67, 7 +; CHECK-NEXT: 
v_readlane_b32 s12, v3, 32 +; CHECK-NEXT: v_readlane_b32 s13, v3, 33 +; CHECK-NEXT: v_readlane_b32 s14, v3, 34 +; CHECK-NEXT: v_readlane_b32 s15, v3, 35 +; CHECK-NEXT: v_readlane_b32 s16, v3, 36 +; CHECK-NEXT: v_readlane_b32 s17, v3, 37 +; CHECK-NEXT: v_readlane_b32 s18, v3, 38 +; CHECK-NEXT: v_readlane_b32 s19, v3, 39 +; CHECK-NEXT: v_writelane_b32 v3, s4, 40 +; CHECK-NEXT: v_writelane_b32 v3, s5, 41 +; CHECK-NEXT: v_writelane_b32 v3, s6, 42 +; CHECK-NEXT: v_writelane_b32 v3, s7, 43 +; CHECK-NEXT: v_writelane_b32 v3, s8, 44 +; CHECK-NEXT: v_writelane_b32 v3, s9, 45 +; CHECK-NEXT: v_writelane_b32 v3, s10, 46 +; CHECK-NEXT: v_writelane_b32 v3, s11, 47 +; CHECK-NEXT: v_writelane_b32 v3, s12, 48 +; CHECK-NEXT: v_writelane_b32 v3, s13, 49 +; CHECK-NEXT: v_writelane_b32 v3, s14, 50 +; CHECK-NEXT: v_writelane_b32 v3, s15, 51 +; CHECK-NEXT: v_writelane_b32 v3, s16, 52 +; CHECK-NEXT: v_writelane_b32 v3, s17, 53 +; CHECK-NEXT: v_writelane_b32 v3, s18, 54 +; CHECK-NEXT: v_writelane_b32 v3, s19, 55 +; CHECK-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v3, s52, 56 +; CHECK-NEXT: v_writelane_b32 v2, s60, 0 +; CHECK-NEXT: v_writelane_b32 v3, s53, 57 +; CHECK-NEXT: v_writelane_b32 v2, s61, 1 +; CHECK-NEXT: v_writelane_b32 v3, s54, 58 +; CHECK-NEXT: v_writelane_b32 v2, s62, 2 +; CHECK-NEXT: v_writelane_b32 v3, s55, 59 +; CHECK-NEXT: v_writelane_b32 v2, s63, 3 +; CHECK-NEXT: v_writelane_b32 v3, s56, 60 +; CHECK-NEXT: v_writelane_b32 v2, s64, 4 +; CHECK-NEXT: v_writelane_b32 v3, s57, 61 +; CHECK-NEXT: v_writelane_b32 v2, s65, 5 +; CHECK-NEXT: v_writelane_b32 v3, s58, 62 +; CHECK-NEXT: v_writelane_b32 v2, s66, 6 +; CHECK-NEXT: v_writelane_b32 v3, s59, 63 +; CHECK-NEXT: v_writelane_b32 v2, s67, 7 ; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 @@ -219,109 +219,108 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s9, s8 ; 
CHECK-NEXT: v_mov_b32_e32 v0, s8 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 +; CHECK-NEXT: v_readlane_b32 s36, v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 -; CHECK-NEXT: image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 16 -; CHECK-NEXT: v_readlane_b32 s44, v4, 24 -; CHECK-NEXT: v_readlane_b32 s45, v4, 25 -; CHECK-NEXT: v_readlane_b32 s46, v4, 26 -; CHECK-NEXT: v_readlane_b32 s47, v4, 27 -; CHECK-NEXT: v_readlane_b32 s48, v4, 28 -; CHECK-NEXT: v_readlane_b32 s49, v4, 29 -; CHECK-NEXT: v_readlane_b32 s50, v4, 30 -; CHECK-NEXT: v_readlane_b32 s51, v4, 31 -; CHECK-NEXT: v_mov_b32_e32 v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v7, v6 -; CHECK-NEXT: v_readlane_b32 s37, v4, 17 -; CHECK-NEXT: v_readlane_b32 s38, v4, 18 -; CHECK-NEXT: v_readlane_b32 s39, v4, 19 +; CHECK-NEXT: v_readlane_b32 s37, v3, 1 +; CHECK-NEXT: v_readlane_b32 s38, v3, 2 +; CHECK-NEXT: v_readlane_b32 s39, v3, 3 +; CHECK-NEXT: v_readlane_b32 s40, v3, 4 +; CHECK-NEXT: v_readlane_b32 s41, v3, 5 +; CHECK-NEXT: v_readlane_b32 s42, v3, 6 +; CHECK-NEXT: v_readlane_b32 s43, v3, 7 +; CHECK-NEXT: v_readlane_b32 s44, v3, 8 +; CHECK-NEXT: v_readlane_b32 s45, v3, 9 +; CHECK-NEXT: v_readlane_b32 s46, v3, 10 +; CHECK-NEXT: v_readlane_b32 s47, v3, 11 +; CHECK-NEXT: v_readlane_b32 s48, v3, 12 +; CHECK-NEXT: 
v_readlane_b32 s49, v3, 13 +; CHECK-NEXT: v_readlane_b32 s50, v3, 14 +; CHECK-NEXT: v_readlane_b32 s51, v3, 15 +; CHECK-NEXT: image_sample_lz v4, v[0:1], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s36, v3, 16 +; CHECK-NEXT: v_readlane_b32 s44, v3, 24 +; CHECK-NEXT: v_readlane_b32 s45, v3, 25 +; CHECK-NEXT: v_readlane_b32 s46, v3, 26 +; CHECK-NEXT: v_readlane_b32 s47, v3, 27 +; CHECK-NEXT: v_readlane_b32 s48, v3, 28 +; CHECK-NEXT: v_readlane_b32 s49, v3, 29 +; CHECK-NEXT: v_readlane_b32 s50, v3, 30 +; CHECK-NEXT: v_readlane_b32 s51, v3, 31 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v6, v5 +; CHECK-NEXT: v_readlane_b32 s37, v3, 17 +; CHECK-NEXT: v_readlane_b32 s38, v3, 18 +; CHECK-NEXT: v_readlane_b32 s39, v3, 19 ; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s40, v4, 20 -; CHECK-NEXT: v_readlane_b32 s41, v4, 21 -; CHECK-NEXT: v_readlane_b32 s42, v4, 22 -; CHECK-NEXT: v_readlane_b32 s43, v4, 23 +; CHECK-NEXT: v_readlane_b32 s40, v3, 20 +; CHECK-NEXT: v_readlane_b32 s41, v3, 21 +; CHECK-NEXT: v_readlane_b32 s42, v3, 22 +; CHECK-NEXT: v_readlane_b32 s43, v3, 23 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[5:7], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[4:6], off, s[8:11], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .LBB0_6: ; %Flow12 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23] -; CHECK-NEXT: v_readlane_b32 s52, v4, 40 -; CHECK-NEXT: v_readlane_b32 s53, v4, 41 -; CHECK-NEXT: v_readlane_b32 s54, v4, 42 -; CHECK-NEXT: v_readlane_b32 s55, v4, 43 -; CHECK-NEXT: v_readlane_b32 s56, v4, 44 -; CHECK-NEXT: v_readlane_b32 s57, v4, 45 -; CHECK-NEXT: v_readlane_b32 s58, v4, 46 -; CHECK-NEXT: v_readlane_b32 s59, v4, 47 -; CHECK-NEXT: v_readlane_b32 s60, v4, 48 -; CHECK-NEXT: v_readlane_b32 s61, v4, 49 -; CHECK-NEXT: v_readlane_b32 s62, v4, 50 -; 
CHECK-NEXT: v_readlane_b32 s63, v4, 51 -; CHECK-NEXT: v_readlane_b32 s64, v4, 52 -; CHECK-NEXT: v_readlane_b32 s65, v4, 53 -; CHECK-NEXT: v_readlane_b32 s66, v4, 54 -; CHECK-NEXT: v_readlane_b32 s67, v4, 55 +; CHECK-NEXT: v_readlane_b32 s52, v3, 40 +; CHECK-NEXT: v_readlane_b32 s53, v3, 41 +; CHECK-NEXT: v_readlane_b32 s54, v3, 42 +; CHECK-NEXT: v_readlane_b32 s55, v3, 43 +; CHECK-NEXT: v_readlane_b32 s56, v3, 44 +; CHECK-NEXT: v_readlane_b32 s57, v3, 45 +; CHECK-NEXT: v_readlane_b32 s58, v3, 46 +; CHECK-NEXT: v_readlane_b32 s59, v3, 47 +; CHECK-NEXT: v_readlane_b32 s60, v3, 48 +; CHECK-NEXT: v_readlane_b32 s61, v3, 49 +; CHECK-NEXT: v_readlane_b32 s62, v3, 50 +; CHECK-NEXT: v_readlane_b32 s63, v3, 51 +; CHECK-NEXT: v_readlane_b32 s64, v3, 52 +; CHECK-NEXT: v_readlane_b32 s65, v3, 53 +; CHECK-NEXT: v_readlane_b32 s66, v3, 54 +; CHECK-NEXT: v_readlane_b32 s67, v3, 55 ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s6 -; CHECK-NEXT: v_readlane_b32 s36, v4, 56 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_readlane_b32 s36, v3, 56 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s7 -; CHECK-NEXT: v_readlane_b32 s37, v4, 57 -; CHECK-NEXT: v_readlane_b32 s38, v4, 58 -; CHECK-NEXT: v_readlane_b32 s39, v4, 59 -; CHECK-NEXT: v_readlane_b32 s40, v4, 60 -; CHECK-NEXT: v_readlane_b32 s41, v4, 61 -; CHECK-NEXT: v_readlane_b32 s42, v4, 62 -; CHECK-NEXT: v_readlane_b32 s43, v4, 63 -; CHECK-NEXT: s_nop 4 -; CHECK-NEXT: image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1 -; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 +; CHECK-NEXT: v_readlane_b32 s37, v3, 57 +; CHECK-NEXT: v_readlane_b32 s38, v3, 
58 +; CHECK-NEXT: v_readlane_b32 s39, v3, 59 +; CHECK-NEXT: v_readlane_b32 s40, v3, 60 +; CHECK-NEXT: v_readlane_b32 s41, v3, 61 +; CHECK-NEXT: v_readlane_b32 s42, v3, 62 +; CHECK-NEXT: v_readlane_b32 s43, v3, 63 +; CHECK-NEXT: v_readlane_b32 s44, v2, 0 +; CHECK-NEXT: v_readlane_b32 s45, v2, 1 +; CHECK-NEXT: v_readlane_b32 s46, v2, 2 +; CHECK-NEXT: v_readlane_b32 s47, v2, 3 +; CHECK-NEXT: v_readlane_b32 s48, v2, 4 +; CHECK-NEXT: v_readlane_b32 s49, v2, 5 +; CHECK-NEXT: v_readlane_b32 s50, v2, 6 +; CHECK-NEXT: v_readlane_b32 s51, v2, 7 +; CHECK-NEXT: image_sample_lz v1, v[4:5], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: image_sample_lz v2, v[4:5], s[52:59], s[8:11] dmask:0x1 ; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37] ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: v_readlane_b32 s44, v3, 0 -; CHECK-NEXT: v_readlane_b32 s45, v3, 1 -; CHECK-NEXT: v_readlane_b32 s46, v3, 2 -; CHECK-NEXT: v_readlane_b32 s47, v3, 3 -; CHECK-NEXT: v_readlane_b32 s48, v3, 4 -; CHECK-NEXT: v_readlane_b32 s49, v3, 5 -; CHECK-NEXT: v_readlane_b32 s50, v3, 6 -; CHECK-NEXT: v_readlane_b32 s51, v3, 7 ; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39] ; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41] ; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43] ; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 +; CHECK-NEXT: ; kill: killed $vgpr4_vgpr5 ; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 ; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v6, v5 +; CHECK-NEXT: v_sub_f32_e32 v1, v2, v1 ; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB0_8: ; %bb33 @@ -334,46 +333,44 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] -; CHECK-NEXT: v_readlane_b32 s67, v8, 33 -; CHECK-NEXT: v_readlane_b32 s66, v8, 32 -; 
CHECK-NEXT: v_readlane_b32 s65, v8, 31 -; CHECK-NEXT: v_readlane_b32 s64, v8, 30 -; CHECK-NEXT: v_readlane_b32 s63, v8, 29 -; CHECK-NEXT: v_readlane_b32 s62, v8, 28 -; CHECK-NEXT: v_readlane_b32 s61, v8, 27 -; CHECK-NEXT: v_readlane_b32 s60, v8, 26 -; CHECK-NEXT: v_readlane_b32 s59, v8, 25 -; CHECK-NEXT: v_readlane_b32 s58, v8, 24 -; CHECK-NEXT: v_readlane_b32 s57, v8, 23 -; CHECK-NEXT: v_readlane_b32 s56, v8, 22 -; CHECK-NEXT: v_readlane_b32 s55, v8, 21 -; CHECK-NEXT: v_readlane_b32 s54, v8, 20 -; CHECK-NEXT: v_readlane_b32 s53, v8, 19 -; CHECK-NEXT: v_readlane_b32 s52, v8, 18 -; CHECK-NEXT: v_readlane_b32 s51, v8, 17 -; CHECK-NEXT: v_readlane_b32 s50, v8, 16 -; CHECK-NEXT: v_readlane_b32 s49, v8, 15 -; CHECK-NEXT: v_readlane_b32 s48, v8, 14 -; CHECK-NEXT: v_readlane_b32 s47, v8, 13 -; CHECK-NEXT: v_readlane_b32 s46, v8, 12 -; CHECK-NEXT: v_readlane_b32 s45, v8, 11 -; CHECK-NEXT: v_readlane_b32 s44, v8, 10 -; CHECK-NEXT: v_readlane_b32 s43, v8, 9 -; CHECK-NEXT: v_readlane_b32 s42, v8, 8 -; CHECK-NEXT: v_readlane_b32 s41, v8, 7 -; CHECK-NEXT: v_readlane_b32 s40, v8, 6 -; CHECK-NEXT: v_readlane_b32 s39, v8, 5 -; CHECK-NEXT: v_readlane_b32 s38, v8, 4 -; CHECK-NEXT: v_readlane_b32 s37, v8, 3 -; CHECK-NEXT: v_readlane_b32 s36, v8, 2 -; CHECK-NEXT: v_readlane_b32 s31, v8, 1 -; CHECK-NEXT: v_readlane_b32 s30, v8, 0 -; CHECK-NEXT: ; kill: killed $vgpr4 -; CHECK-NEXT: ; kill: killed $vgpr3 +; CHECK-NEXT: v_readlane_b32 s67, v7, 33 +; CHECK-NEXT: v_readlane_b32 s66, v7, 32 +; CHECK-NEXT: v_readlane_b32 s65, v7, 31 +; CHECK-NEXT: v_readlane_b32 s64, v7, 30 +; CHECK-NEXT: v_readlane_b32 s63, v7, 29 +; CHECK-NEXT: v_readlane_b32 s62, v7, 28 +; CHECK-NEXT: v_readlane_b32 s61, v7, 27 +; CHECK-NEXT: v_readlane_b32 s60, v7, 26 +; CHECK-NEXT: v_readlane_b32 s59, v7, 25 +; CHECK-NEXT: v_readlane_b32 s58, v7, 24 +; CHECK-NEXT: v_readlane_b32 s57, v7, 23 +; CHECK-NEXT: v_readlane_b32 s56, v7, 22 +; CHECK-NEXT: v_readlane_b32 s55, v7, 21 +; CHECK-NEXT: v_readlane_b32 s54, v7, 20 +; 
CHECK-NEXT: v_readlane_b32 s53, v7, 19 +; CHECK-NEXT: v_readlane_b32 s52, v7, 18 +; CHECK-NEXT: v_readlane_b32 s51, v7, 17 +; CHECK-NEXT: v_readlane_b32 s50, v7, 16 +; CHECK-NEXT: v_readlane_b32 s49, v7, 15 +; CHECK-NEXT: v_readlane_b32 s48, v7, 14 +; CHECK-NEXT: v_readlane_b32 s47, v7, 13 +; CHECK-NEXT: v_readlane_b32 s46, v7, 12 +; CHECK-NEXT: v_readlane_b32 s45, v7, 11 +; CHECK-NEXT: v_readlane_b32 s44, v7, 10 +; CHECK-NEXT: v_readlane_b32 s43, v7, 9 +; CHECK-NEXT: v_readlane_b32 s42, v7, 8 +; CHECK-NEXT: v_readlane_b32 s41, v7, 7 +; CHECK-NEXT: v_readlane_b32 s40, v7, 6 +; CHECK-NEXT: v_readlane_b32 s39, v7, 5 +; CHECK-NEXT: v_readlane_b32 s38, v7, 4 +; CHECK-NEXT: v_readlane_b32 s37, v7, 3 +; CHECK-NEXT: v_readlane_b32 s36, v7, 2 +; CHECK-NEXT: v_readlane_b32 s31, v7, 1 +; CHECK-NEXT: v_readlane_b32 s30, v7, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 76b007c22b699..9b245891ce7ae 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4 + ; GFX908-NEXT: INLINEASM &"; 
def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %4 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %4 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %4 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %4 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %4 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %4 + ; GFX90A-NEXT: 
INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %4 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6619145 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -46,16 +46,16 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %4 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %4 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %4 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %4 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %4 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git 
a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 1acbb09118280..fc67a07523c03 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -13,17 +13,13 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_load_dword s8, s[6:7], 0x0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_writelane_b32 v0, s8, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 ; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 @@ -78,24 +74,14 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %store -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100000 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) -; CHECK-NEXT: ds_write_b32 v1, v2 -; CHECK-NEXT: ; kill: killed $vgpr0 +; CHECK-NEXT: ds_write_b32 v0, v1 ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %end -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm %arr = alloca < 1339 x i32>, align 8192, addrspace(5) %cmp = icmp ne i32 %val, 0 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index b5ee6689f8dc3..b75fa7ea181ab 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -144,30 +144,26 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v5, v3 -; W64-O0-NEXT: v_mov_b32_e32 v6, v2 -; W64-O0-NEXT: v_mov_b32_e32 v7, v1 -; W64-O0-NEXT: v_mov_b32_e32 v1, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v4, v3 +; W64-O0-NEXT: v_mov_b32_e32 v5, v2 +; W64-O0-NEXT: v_mov_b32_e32 v6, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: 
v_mov_b32_e32 v2, v7 -; W64-O0-NEXT: v_mov_b32_e32 v3, v6 -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v6 +; W64-O0-NEXT: v_mov_b32_e32 v2, v5 +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 @@ -238,7 +234,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -495,34 +490,34 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: v_mov_b32_e32 v13, v4 -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v7, v3 -; W64-O0-NEXT: v_mov_b32_e32 v8, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v9, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v3, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v6, v3 +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v7, v2 +; W64-O0-NEXT: v_mov_b32_e32 v8, v1 +; W64-O0-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v2, v0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v14, v5 -; W64-O0-NEXT: v_mov_b32_e32 v15, v6 ; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_mov_b32_e32 v16, v4 +; W64-O0-NEXT: v_mov_b32_e32 v15, v4 +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_mov_b32_e32 v16, v3 ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -532,34 +527,35 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v9 -; W64-O0-NEXT: v_mov_b32_e32 v5, v8 -; W64-O0-NEXT: v_mov_b32_e32 v6, v7 -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v8 +; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: v_mov_b32_e32 v5, v6 +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v12 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v12 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v10 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v10 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 ; W64-O0-NEXT: s_mov_b64 
s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 @@ -613,12 +609,12 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: @@ -681,37 +677,37 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt 
vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 +; W64-O0-NEXT: v_readlane_b32 s4, v6, 9 +; W64-O0-NEXT: v_readlane_b32 s5, v6, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[4:5], v6, off +; W64-O0-NEXT: global_store_dword v[3:4], v5, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded 
Reload +; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -1021,61 +1017,59 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v8, v6 -; W64-O0-NEXT: v_mov_b32_e32 v9, v5 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v8, v5 +; W64-O0-NEXT: v_mov_b32_e32 v5, v4 +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v10, v3 -; W64-O0-NEXT: v_mov_b32_e32 v11, v2 -; W64-O0-NEXT: v_mov_b32_e32 v13, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v6, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v9, v3 +; W64-O0-NEXT: v_mov_b32_e32 v10, v2 +; W64-O0-NEXT: v_mov_b32_e32 v11, v1 +; W64-O0-NEXT: v_mov_b32_e32 v5, v0 +; W64-O0-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v9 -; W64-O0-NEXT: v_mov_b32_e32 v3, v8 -; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v8 +; W64-O0-NEXT: v_mov_b32_e32 v2, v6 +; W64-O0-NEXT: v_mov_b32_e32 v3, v7 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v7, v13 -; W64-O0-NEXT: v_mov_b32_e32 v8, v11 -; W64-O0-NEXT: v_mov_b32_e32 v9, v10 -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v6, v11 +; W64-O0-NEXT: v_mov_b32_e32 v7, v10 +; W64-O0-NEXT: v_mov_b32_e32 v8, v9 +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: 
$sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v6, v12 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v5, v12 +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; W64-O0-NEXT: ;;#ASMEND ; W64-O0-NEXT: s_mov_b32 s5, s4 +; W64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 @@ -1250,22 +1244,22 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 +; W64-O0-NEXT: v_readlane_b32 s4, v3, 10 +; W64-O0-NEXT: v_readlane_b32 s5, v3, 11 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 162c47f879465..bca6903e16a2d 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -140,48 +140,41 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; W64-O0: ; %bb.0: ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v6, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v3, v1 -; W64-O0-NEXT: v_mov_b32_e32 v1, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v5, v2 +; W64-O0-NEXT: v_mov_b32_e32 v2, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: v_mov_b32_e32 v5, v7 -; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v6, v3 +; W64-O0-NEXT: v_mov_b32_e32 v4, v6 +; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v3 -; 
W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v2 +; W64-O0-NEXT: v_mov_b32_e32 v6, v1 +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v7 -; W64-O0-NEXT: v_mov_b32_e32 v3, v6 -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v6 +; W64-O0-NEXT: v_mov_b32_e32 v2, v5 +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 @@ -235,12 +228,12 @@ define float @mubuf_vgpr(ptr 
addrspace(8) %i, i32 %c) #0 { ; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 ; W64-O0-NEXT: ; %bb.3: @@ -251,13 +244,12 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 ; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: ; kill: killed $vgpr1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -509,45 +501,44 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; W64-O0-NEXT: v_mov_b32_e32 v14, v6 -; W64-O0-NEXT: v_mov_b32_e32 v9, v5 +; W64-O0-NEXT: v_mov_b32_e32 v8, v5 +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v13, v4 -; W64-O0-NEXT: v_mov_b32_e32 v4, v3 -; W64-O0-NEXT: v_mov_b32_e32 v8, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v5, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v3, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v7, v2 +; W64-O0-NEXT: v_mov_b32_e32 v4, v1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v2, v0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: ; 
implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v15, v7 -; W64-O0-NEXT: v_mov_b32_e32 v6, v15 -; W64-O0-NEXT: v_mov_b32_e32 v7, v14 +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_mov_b32_e32 v15, v5 +; W64-O0-NEXT: v_mov_b32_e32 v5, v15 +; W64-O0-NEXT: v_mov_b32_e32 v6, v14 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v9 -; W64-O0-NEXT: v_mov_b32_e32 v9, v14 +; W64-O0-NEXT: v_mov_b32_e32 v14, v8 +; W64-O0-NEXT: v_mov_b32_e32 v8, v14 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v9 -; W64-O0-NEXT: v_mov_b32_e32 v15, v7 -; W64-O0-NEXT: v_mov_b32_e32 v16, v6 +; W64-O0-NEXT: v_mov_b32_e32 v14, v8 +; W64-O0-NEXT: v_mov_b32_e32 v15, v6 +; W64-O0-NEXT: v_mov_b32_e32 v16, v5 ; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -555,43 +546,43 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v9, v4 -; W64-O0-NEXT: v_mov_b32_e32 v7, v9 -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; 
W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v8, v3 +; W64-O0-NEXT: v_mov_b32_e32 v6, v8 +; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: v_mov_b32_e32 v9, v4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: v_mov_b32_e32 v8, v3 +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v9 -; W64-O0-NEXT: v_mov_b32_e32 v5, v8 -; W64-O0-NEXT: v_mov_b32_e32 v6, v7 -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v8 +; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: v_mov_b32_e32 v5, v6 +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v12 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v12 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v10 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v10 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 @@ -599,6 +590,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 +; W64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 @@ 
-652,12 +644,12 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: @@ -720,37 +712,37 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 ; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 ; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; 
W64-O0-NEXT: v_readlane_b32 s5, v0, 10 +; W64-O0-NEXT: v_readlane_b32 s4, v6, 9 +; W64-O0-NEXT: v_readlane_b32 s5, v6, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[4:5], v6, off +; W64-O0-NEXT: global_store_dword v[3:4], v5, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v6, off, 
s[0:3], s32 offset:84 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -1060,46 +1052,42 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v6, v5 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v5, v4 +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v4, v3 -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v13, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v10, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v9, v2 +; W64-O0-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v6, v1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v8, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v4 -; W64-O0-NEXT: v_mov_b32_e32 v4, v14 -; W64-O0-NEXT: v_mov_b32_e32 v6, v13 +; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v10, v3 +; W64-O0-NEXT: v_mov_b32_e32 v3, v10 +; W64-O0-NEXT: v_mov_b32_e32 v5, v9 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v9, v10 -; W64-O0-NEXT: v_mov_b32_e32 v13, v9 +; W64-O0-NEXT: v_mov_b32_e32 v9, v6 +; W64-O0-NEXT: v_mov_b32_e32 v6, v9 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v9, v13 -; W64-O0-NEXT: v_mov_b32_e32 v10, v6 -; W64-O0-NEXT: v_mov_b32_e32 v11, v4 +; W64-O0-NEXT: v_mov_b32_e32 v9, v6 +; W64-O0-NEXT: v_mov_b32_e32 v10, v5 +; W64-O0-NEXT: v_mov_b32_e32 v11, v3 ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 
4-byte Folded Spill @@ -1107,31 +1095,32 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v6, v7 +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v5, v7 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v2 +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v12 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v12 +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 
-; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; W64-O0-NEXT: ;;#ASMEND ; W64-O0-NEXT: s_mov_b32 s5, s4 +; W64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 @@ -1327,22 +1316,22 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 +; W64-O0-NEXT: v_readlane_b32 s4, v3, 10 +; W64-O0-NEXT: v_readlane_b32 s5, v3, 11 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off 
+; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 45fbaaabc65b5..807325801afda 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -11,9 +11,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %26 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %26 ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %26 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3604490 /* regdef:VReg_64 */, def %23 ; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 
0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) @@ -36,9 +36,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3604490 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -60,9 +60,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect 
attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %25 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %25 ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3932170 /* regdef:VReg_64_Align2 */, def %23 ; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -83,9 +83,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def 
renamable $vgpr0_vgpr1 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3932170 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index 5b0354e63c236..f2a595be85d7c 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -17,195 +17,195 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: s_mov_b32 s95, 0xe8f000 ; GCN-NEXT: s_add_u32 s92, s92, s9 ; GCN-NEXT: s_addc_u32 s93, s93, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 0 -; GCN-NEXT: v_writelane_b32 v2, s5, 1 -; GCN-NEXT: v_writelane_b32 v2, s6, 2 -; GCN-NEXT: v_writelane_b32 v2, s7, 3 -; GCN-NEXT: v_writelane_b32 v2, s8, 4 -; GCN-NEXT: v_writelane_b32 v2, s9, 5 -; GCN-NEXT: v_writelane_b32 v2, s10, 6 -; GCN-NEXT: v_writelane_b32 v2, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: 
v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 8 -; GCN-NEXT: v_writelane_b32 v2, s5, 9 -; GCN-NEXT: v_writelane_b32 v2, s6, 10 -; GCN-NEXT: v_writelane_b32 v2, s7, 11 -; GCN-NEXT: v_writelane_b32 v2, s8, 12 -; GCN-NEXT: v_writelane_b32 v2, s9, 13 -; GCN-NEXT: v_writelane_b32 v2, s10, 14 -; GCN-NEXT: v_writelane_b32 v2, s11, 15 +; GCN-NEXT: v_writelane_b32 v0, s4, 8 +; GCN-NEXT: v_writelane_b32 v0, s5, 9 +; GCN-NEXT: v_writelane_b32 v0, s6, 10 +; GCN-NEXT: v_writelane_b32 v0, s7, 11 +; GCN-NEXT: v_writelane_b32 v0, s8, 12 +; GCN-NEXT: v_writelane_b32 v0, s9, 13 +; GCN-NEXT: v_writelane_b32 v0, s10, 14 +; GCN-NEXT: v_writelane_b32 v0, s11, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 16 -; GCN-NEXT: v_writelane_b32 v2, s5, 17 -; GCN-NEXT: v_writelane_b32 v2, s6, 18 -; GCN-NEXT: v_writelane_b32 v2, s7, 19 -; GCN-NEXT: v_writelane_b32 v2, s8, 20 -; GCN-NEXT: v_writelane_b32 v2, s9, 21 -; GCN-NEXT: v_writelane_b32 v2, s10, 22 -; GCN-NEXT: v_writelane_b32 v2, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 24 -; GCN-NEXT: v_writelane_b32 v2, s5, 25 -; GCN-NEXT: v_writelane_b32 v2, s6, 26 -; GCN-NEXT: v_writelane_b32 v2, s7, 27 -; GCN-NEXT: v_writelane_b32 v2, s8, 28 -; GCN-NEXT: v_writelane_b32 v2, s9, 29 -; GCN-NEXT: v_writelane_b32 v2, s10, 30 -; GCN-NEXT: v_writelane_b32 v2, 
s11, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 24 +; GCN-NEXT: v_writelane_b32 v0, s5, 25 +; GCN-NEXT: v_writelane_b32 v0, s6, 26 +; GCN-NEXT: v_writelane_b32 v0, s7, 27 +; GCN-NEXT: v_writelane_b32 v0, s8, 28 +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: v_writelane_b32 v0, s10, 30 +; GCN-NEXT: v_writelane_b32 v0, s11, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 32 -; GCN-NEXT: v_writelane_b32 v2, s5, 33 -; GCN-NEXT: v_writelane_b32 v2, s6, 34 -; GCN-NEXT: v_writelane_b32 v2, s7, 35 -; GCN-NEXT: v_writelane_b32 v2, s8, 36 -; GCN-NEXT: v_writelane_b32 v2, s9, 37 -; GCN-NEXT: v_writelane_b32 v2, s10, 38 -; GCN-NEXT: v_writelane_b32 v2, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 40 -; GCN-NEXT: v_writelane_b32 v2, s5, 41 -; GCN-NEXT: v_writelane_b32 v2, s6, 42 -; GCN-NEXT: v_writelane_b32 v2, s7, 43 -; GCN-NEXT: v_writelane_b32 v2, s8, 44 -; GCN-NEXT: v_writelane_b32 v2, s9, 45 -; GCN-NEXT: v_writelane_b32 v2, s10, 46 -; GCN-NEXT: v_writelane_b32 v2, s11, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 40 +; GCN-NEXT: v_writelane_b32 v0, s5, 41 +; GCN-NEXT: v_writelane_b32 v0, s6, 42 +; GCN-NEXT: v_writelane_b32 v0, s7, 43 +; GCN-NEXT: v_writelane_b32 v0, s8, 44 +; GCN-NEXT: v_writelane_b32 v0, s9, 45 +; GCN-NEXT: v_writelane_b32 v0, s10, 46 +; GCN-NEXT: v_writelane_b32 v0, s11, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 48 -; GCN-NEXT: v_writelane_b32 v2, s5, 49 -; GCN-NEXT: v_writelane_b32 v2, s6, 50 -; GCN-NEXT: v_writelane_b32 v2, s7, 51 -; 
GCN-NEXT: v_writelane_b32 v2, s8, 52 -; GCN-NEXT: v_writelane_b32 v2, s9, 53 -; GCN-NEXT: v_writelane_b32 v2, s10, 54 -; GCN-NEXT: v_writelane_b32 v2, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 56 -; GCN-NEXT: v_writelane_b32 v2, s5, 57 -; GCN-NEXT: v_writelane_b32 v2, s6, 58 -; GCN-NEXT: v_writelane_b32 v2, s7, 59 -; GCN-NEXT: v_writelane_b32 v2, s8, 60 -; GCN-NEXT: v_writelane_b32 v2, s9, 61 -; GCN-NEXT: v_writelane_b32 v2, s10, 62 -; GCN-NEXT: v_writelane_b32 v2, s11, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 56 +; GCN-NEXT: v_writelane_b32 v0, s5, 57 +; GCN-NEXT: v_writelane_b32 v0, s6, 58 +; GCN-NEXT: v_writelane_b32 v0, s7, 59 +; GCN-NEXT: v_writelane_b32 v0, s8, 60 +; GCN-NEXT: v_writelane_b32 v0, s9, 61 +; GCN-NEXT: v_writelane_b32 v0, s10, 62 +; GCN-NEXT: v_writelane_b32 v0, s11, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 
v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 8 -; GCN-NEXT: v_writelane_b32 v1, s5, 9 -; GCN-NEXT: v_writelane_b32 v1, s6, 10 -; GCN-NEXT: v_writelane_b32 v1, s7, 11 -; GCN-NEXT: v_writelane_b32 v1, s8, 12 -; GCN-NEXT: v_writelane_b32 v1, s9, 13 -; GCN-NEXT: v_writelane_b32 v1, s10, 14 -; GCN-NEXT: v_writelane_b32 v1, s11, 15 +; GCN-NEXT: v_writelane_b32 v0, s4, 8 +; GCN-NEXT: v_writelane_b32 v0, s5, 9 +; GCN-NEXT: v_writelane_b32 v0, s6, 10 +; GCN-NEXT: v_writelane_b32 v0, s7, 11 +; GCN-NEXT: v_writelane_b32 v0, s8, 12 +; GCN-NEXT: v_writelane_b32 v0, s9, 13 +; GCN-NEXT: v_writelane_b32 v0, s10, 14 +; GCN-NEXT: v_writelane_b32 v0, s11, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 24 -; GCN-NEXT: v_writelane_b32 v1, s5, 25 -; GCN-NEXT: v_writelane_b32 v1, s6, 26 -; GCN-NEXT: v_writelane_b32 v1, s7, 27 -; GCN-NEXT: v_writelane_b32 v1, s8, 28 -; GCN-NEXT: v_writelane_b32 v1, s9, 29 -; GCN-NEXT: v_writelane_b32 v1, s10, 30 -; 
GCN-NEXT: v_writelane_b32 v1, s11, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 24 +; GCN-NEXT: v_writelane_b32 v0, s5, 25 +; GCN-NEXT: v_writelane_b32 v0, s6, 26 +; GCN-NEXT: v_writelane_b32 v0, s7, 27 +; GCN-NEXT: v_writelane_b32 v0, s8, 28 +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: v_writelane_b32 v0, s10, 30 +; GCN-NEXT: v_writelane_b32 v0, s11, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 40 -; GCN-NEXT: v_writelane_b32 v1, s5, 41 -; GCN-NEXT: v_writelane_b32 v1, s6, 42 -; GCN-NEXT: v_writelane_b32 v1, s7, 43 -; GCN-NEXT: v_writelane_b32 v1, s8, 44 -; GCN-NEXT: v_writelane_b32 v1, s9, 45 -; GCN-NEXT: v_writelane_b32 v1, s10, 46 -; GCN-NEXT: v_writelane_b32 v1, s11, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 40 +; GCN-NEXT: v_writelane_b32 v0, s5, 41 +; GCN-NEXT: v_writelane_b32 v0, s6, 42 +; GCN-NEXT: v_writelane_b32 v0, s7, 43 +; GCN-NEXT: v_writelane_b32 v0, s8, 44 +; GCN-NEXT: v_writelane_b32 v0, s9, 45 +; GCN-NEXT: v_writelane_b32 v0, s10, 46 +; GCN-NEXT: v_writelane_b32 v0, s11, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: 
v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 56 -; GCN-NEXT: v_writelane_b32 v1, s5, 57 -; GCN-NEXT: v_writelane_b32 v1, s6, 58 -; GCN-NEXT: v_writelane_b32 v1, s7, 59 -; GCN-NEXT: v_writelane_b32 v1, s8, 60 -; GCN-NEXT: v_writelane_b32 v1, s9, 61 -; GCN-NEXT: v_writelane_b32 v1, s10, 62 -; GCN-NEXT: v_writelane_b32 v1, s11, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 56 +; GCN-NEXT: v_writelane_b32 v0, s5, 57 +; GCN-NEXT: v_writelane_b32 v0, s6, 58 +; GCN-NEXT: v_writelane_b32 v0, s7, 59 +; GCN-NEXT: v_writelane_b32 v0, s8, 60 +; GCN-NEXT: v_writelane_b32 v0, s9, 61 +; GCN-NEXT: v_writelane_b32 v0, s10, 62 +; GCN-NEXT: v_writelane_b32 v0, s11, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: v_writelane_b32 v0, s5, 1 ; GCN-NEXT: v_writelane_b32 v0, s6, 2 @@ -422,18 +422,6 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 
4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -490,91 +478,91 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s9 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 -; GCN-NEXT: v_writelane_b32 v1, s12, 8 -; GCN-NEXT: v_writelane_b32 v1, s13, 9 -; GCN-NEXT: v_writelane_b32 v1, s14, 10 -; GCN-NEXT: v_writelane_b32 v1, s15, 11 -; GCN-NEXT: v_writelane_b32 v1, s16, 12 -; GCN-NEXT: v_writelane_b32 v1, s17, 13 -; GCN-NEXT: v_writelane_b32 v1, s18, 14 -; GCN-NEXT: v_writelane_b32 v1, s19, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; 
GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 -; GCN-NEXT: v_writelane_b32 v1, s12, 24 -; GCN-NEXT: v_writelane_b32 v1, s13, 25 -; GCN-NEXT: v_writelane_b32 v1, s14, 26 -; GCN-NEXT: v_writelane_b32 v1, s15, 27 -; GCN-NEXT: v_writelane_b32 v1, s16, 28 -; GCN-NEXT: v_writelane_b32 v1, s17, 29 -; GCN-NEXT: v_writelane_b32 v1, s18, 30 -; GCN-NEXT: v_writelane_b32 v1, s19, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, 
s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 -; GCN-NEXT: v_writelane_b32 v1, s12, 40 -; GCN-NEXT: v_writelane_b32 v1, s13, 41 -; GCN-NEXT: v_writelane_b32 v1, s14, 42 -; GCN-NEXT: v_writelane_b32 v1, s15, 43 -; GCN-NEXT: v_writelane_b32 v1, s16, 44 -; GCN-NEXT: v_writelane_b32 v1, s17, 45 -; GCN-NEXT: v_writelane_b32 v1, s18, 46 -; GCN-NEXT: v_writelane_b32 v1, s19, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s12, 40 +; GCN-NEXT: v_writelane_b32 v0, s13, 41 +; GCN-NEXT: v_writelane_b32 v0, s14, 42 +; GCN-NEXT: v_writelane_b32 v0, s15, 43 +; GCN-NEXT: v_writelane_b32 v0, s16, 44 +; GCN-NEXT: v_writelane_b32 v0, s17, 45 +; GCN-NEXT: v_writelane_b32 v0, s18, 46 +; GCN-NEXT: v_writelane_b32 v0, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 -; GCN-NEXT: v_writelane_b32 v1, s12, 56 -; GCN-NEXT: v_writelane_b32 v1, s13, 57 -; GCN-NEXT: v_writelane_b32 v1, s14, 58 -; GCN-NEXT: v_writelane_b32 v1, s15, 59 -; GCN-NEXT: v_writelane_b32 v1, s16, 60 -; GCN-NEXT: v_writelane_b32 v1, s17, 61 -; GCN-NEXT: v_writelane_b32 v1, s18, 62 -; GCN-NEXT: v_writelane_b32 v1, s19, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; 
GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: v_writelane_b32 v0, s5, 1 ; GCN-NEXT: v_writelane_b32 v0, s6, 2 @@ -697,14 +685,6 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -741,17 +721,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s9 ; GCN-NEXT: s_addc_u32 s53, s53, 0 
-; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s0, s[2:3], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -765,87 +737,87 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 -; GCN-NEXT: v_writelane_b32 v1, s12, 8 -; GCN-NEXT: v_writelane_b32 v1, s13, 9 -; GCN-NEXT: v_writelane_b32 v1, s14, 10 -; GCN-NEXT: v_writelane_b32 v1, s15, 11 -; GCN-NEXT: v_writelane_b32 v1, s16, 12 -; GCN-NEXT: v_writelane_b32 v1, s17, 13 -; GCN-NEXT: v_writelane_b32 v1, s18, 14 -; GCN-NEXT: v_writelane_b32 v1, s19, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; 
GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 -; GCN-NEXT: v_writelane_b32 v1, s12, 24 -; GCN-NEXT: v_writelane_b32 v1, s13, 25 -; GCN-NEXT: v_writelane_b32 v1, s14, 26 -; GCN-NEXT: v_writelane_b32 v1, s15, 27 -; GCN-NEXT: v_writelane_b32 v1, s16, 28 -; GCN-NEXT: v_writelane_b32 v1, s17, 29 -; GCN-NEXT: v_writelane_b32 v1, s18, 30 -; GCN-NEXT: v_writelane_b32 v1, s19, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 -; GCN-NEXT: v_writelane_b32 v1, s12, 40 -; GCN-NEXT: v_writelane_b32 
v1, s13, 41 -; GCN-NEXT: v_writelane_b32 v1, s14, 42 -; GCN-NEXT: v_writelane_b32 v1, s15, 43 -; GCN-NEXT: v_writelane_b32 v1, s16, 44 -; GCN-NEXT: v_writelane_b32 v1, s17, 45 -; GCN-NEXT: v_writelane_b32 v1, s18, 46 -; GCN-NEXT: v_writelane_b32 v1, s19, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s12, 40 +; GCN-NEXT: v_writelane_b32 v0, s13, 41 +; GCN-NEXT: v_writelane_b32 v0, s14, 42 +; GCN-NEXT: v_writelane_b32 v0, s15, 43 +; GCN-NEXT: v_writelane_b32 v0, s16, 44 +; GCN-NEXT: v_writelane_b32 v0, s17, 45 +; GCN-NEXT: v_writelane_b32 v0, s18, 46 +; GCN-NEXT: v_writelane_b32 v0, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 -; GCN-NEXT: v_writelane_b32 v1, s12, 56 -; GCN-NEXT: v_writelane_b32 v1, s13, 57 -; GCN-NEXT: v_writelane_b32 v1, s14, 58 -; GCN-NEXT: v_writelane_b32 v1, s15, 59 -; GCN-NEXT: v_writelane_b32 v1, s16, 60 -; GCN-NEXT: v_writelane_b32 v1, s17, 61 -; GCN-NEXT: v_writelane_b32 v1, s18, 62 -; GCN-NEXT: v_writelane_b32 v1, s19, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; 
GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -946,14 +918,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: ; use s[0:1] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 @@ -993,17 +957,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s9 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, 
s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -1017,87 +973,87 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 -; GCN-NEXT: v_writelane_b32 v1, s12, 8 -; GCN-NEXT: v_writelane_b32 v1, s13, 9 -; GCN-NEXT: v_writelane_b32 v1, s14, 10 -; GCN-NEXT: v_writelane_b32 v1, s15, 11 -; GCN-NEXT: v_writelane_b32 v1, s16, 12 -; GCN-NEXT: v_writelane_b32 v1, s17, 13 -; GCN-NEXT: v_writelane_b32 v1, s18, 14 -; GCN-NEXT: v_writelane_b32 v1, s19, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, 
s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 -; GCN-NEXT: v_writelane_b32 v1, s12, 24 -; GCN-NEXT: v_writelane_b32 v1, s13, 25 -; GCN-NEXT: v_writelane_b32 v1, s14, 26 -; GCN-NEXT: v_writelane_b32 v1, s15, 27 -; GCN-NEXT: v_writelane_b32 v1, s16, 28 -; GCN-NEXT: v_writelane_b32 v1, s17, 29 -; GCN-NEXT: v_writelane_b32 v1, s18, 30 -; GCN-NEXT: v_writelane_b32 v1, s19, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 -; GCN-NEXT: v_writelane_b32 v1, s12, 40 -; GCN-NEXT: v_writelane_b32 v1, s13, 41 -; GCN-NEXT: v_writelane_b32 v1, s14, 42 -; GCN-NEXT: v_writelane_b32 v1, s15, 43 -; GCN-NEXT: v_writelane_b32 v1, s16, 44 -; GCN-NEXT: v_writelane_b32 v1, s17, 45 -; GCN-NEXT: v_writelane_b32 v1, s18, 46 -; GCN-NEXT: v_writelane_b32 v1, s19, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; 
GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s12, 40 +; GCN-NEXT: v_writelane_b32 v0, s13, 41 +; GCN-NEXT: v_writelane_b32 v0, s14, 42 +; GCN-NEXT: v_writelane_b32 v0, s15, 43 +; GCN-NEXT: v_writelane_b32 v0, s16, 44 +; GCN-NEXT: v_writelane_b32 v0, s17, 45 +; GCN-NEXT: v_writelane_b32 v0, s18, 46 +; GCN-NEXT: v_writelane_b32 v0, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 -; GCN-NEXT: v_writelane_b32 v1, s12, 56 -; GCN-NEXT: v_writelane_b32 v1, s13, 57 -; GCN-NEXT: v_writelane_b32 v1, s14, 58 -; GCN-NEXT: v_writelane_b32 v1, s15, 59 -; GCN-NEXT: v_writelane_b32 v1, s16, 60 -; GCN-NEXT: v_writelane_b32 v1, s17, 61 -; GCN-NEXT: v_writelane_b32 v1, s18, 62 -; GCN-NEXT: v_writelane_b32 v1, s19, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 ; GCN-NEXT: 
s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -1204,14 +1160,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: ; use v0 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 1be041c8dc9b0..d60ebbe0474c5 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -19,7 +19,7 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[18:19] -; GFX906-NEXT: ; implicit-def: $vgpr2 +; GFX906-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GFX906-NEXT: s_mov_b32 s21, s15 ; GFX906-NEXT: v_writelane_b32 v2, s6, 0 ; GFX906-NEXT: v_writelane_b32 v2, 
s7, 1 @@ -300,8 +300,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: v_readlane_b32 s14, v40, 3 ; GFX906-NEXT: v_readlane_b32 s15, v40, 2 ; GFX906-NEXT: v_readlane_b32 s17, v40, 23 -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload @@ -339,7 +337,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 -; GFX906-NEXT: ; kill: killed $vgpr40 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4 @@ -406,7 +403,7 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:172 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[16:17] -; GFX908-NEXT: ; implicit-def: $vgpr2 +; GFX908-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GFX908-NEXT: s_mov_b32 s21, s15 ; GFX908-NEXT: v_writelane_b32 v2, s6, 0 ; GFX908-NEXT: v_writelane_b32 v2, s7, 1 @@ -681,8 +678,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: v_readlane_b32 s14, v40, 3 ; GFX908-NEXT: v_readlane_b32 s15, v40, 2 ; GFX908-NEXT: v_readlane_b32 s17, v40, 23 -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload @@ -754,7 +749,6 @@ define 
void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[4:5] ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload -; GFX908-NEXT: ; kill: killed $vgpr40 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s34, v0 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 6ba66ccf71868..3733bd5b3b80d 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -247,105 +247,103 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: 
s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 -; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[6:7] -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, 
v14 +; GFX9-O0-NEXT: v_ashrrev_i64 v[7:8], s4, v[2:3] +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-O0-NEXT: v_xor_b32_e64 v13, v11, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v4, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def 
$vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v7, v5, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed 
$vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v11, v9, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v2, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v4, v8, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v5 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v3, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v4, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec @@ 
-353,7 +351,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill @@ -388,13 +386,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v8, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 ; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] @@ -1196,247 +1195,244 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: 
buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[16:17] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-O0-NEXT: v_mul_lo_u32 v8, v1, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], s4, v[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v0, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v1, v0 -; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[17:18] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v17 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[17:18], s[6:7], v5, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 -; GFX9-O0-NEXT: v_add3_u32 v2, v0, v2, v3 +; GFX9-O0-NEXT: v_add3_u32 v8, v0, v5, v8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] -; GFX9-O0-NEXT: 
v_mov_b32_e32 v6, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s5, 0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v16, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[8:9], s4, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 +; GFX9-O0-NEXT: v_mul_lo_u32 v9, v8, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v17, v2, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v6 -; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] -; GFX9-O0-NEXT: ; 
kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v19 -; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v0 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v2, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v20 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: v_mul_lo_u32 v14, v14, v0 +; GFX9-O0-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v8, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-O0-NEXT: v_add3_u32 v8, v8, v9, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 +; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr18_vgpr19 killed $exec ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v17, s[6:7], v11, v12 -; 
GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v6, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v16, s[6:7], v14, v15 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v8, v9, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def 
$vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v12 -; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v6, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v20, v9, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v8 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v2, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def 
$vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 -; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v21 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v23, v11, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v0, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v22, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v0, v2, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v20 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v19, s[6:7], v6, v19, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed 
$vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff ; GFX9-O0-NEXT: s_mov_b32 s8, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v19, v19, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_and_b32_e64 v2, v2, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 ; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: v_and_b32_e64 v21, v20, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v0, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v19 +; GFX9-O0-NEXT: v_and_b32_e64 v18, v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v0, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v22 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: ; 
kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v23, v1, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v20 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v19, s[6:7], v1, v19, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s4, v[22:23] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v22 +; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 +; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v5 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v19 -; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], s4, v[0:1] -; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v6, v22 -; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v19, v20 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v6, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v20 -; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v5, v6 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[18:19], s4, v[0:1] +; GFX9-O0-NEXT: v_lshrrev_b64 v[22:23], s4, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_add_co_u32_e64 v18, s[6:7], v8, v9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 +; GFX9-O0-NEXT: v_add_co_u32_e64 v18, s[6:7], v8, v9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v6 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v12 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v11, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 @@ -1444,47 +1440,46 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_xor_b32_e64 v9, v6, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-O0-NEXT: v_xor_b32_e64 v8, v5, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; 
GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 @@ -1707,25 +1702,25 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded 
Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec @@ -1784,13 +1779,14 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v8, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_or_b32_e64 v1, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 0 ; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7] @@ -1987,27 +1983,27 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded 
Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(6) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 @@ -2022,22 +2018,22 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_9 ; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -2077,27 +2073,27 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte 
Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte 
Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2107,30 +2103,30 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 ; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 
offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 
offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] @@ -2270,24 +2266,24 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, 
s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 ; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 @@ -2297,42 +2293,42 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, 
s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 ; GFX9-O0-NEXT: s_branch .LBB1_1 ; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2416,12 +2412,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 @@ -2432,30 +2428,30 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded 
Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 @@ -2497,14 +2493,14 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 ; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] @@ -2550,12 +2546,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 @@ -2570,18 +2566,18 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 
; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] @@ -2594,265 +2590,261 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 ; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: 
v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mul_lo_u32 v4, v5, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], s4, v[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v6, v3 +; GFX9-O0-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v6, v2, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-O0-NEXT: v_mul_lo_u32 v5, v6, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], s4, v[13:14] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v7, v3 -; GFX9-O0-NEXT: v_mad_u64_u32 v[13:14], s[6:7], v7, v2, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v5 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[17:18], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: v_lshlrev_b64 v[3:4], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s5, 0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_or_b32_e64 v13, v3, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed 
$vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v8 -; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 -; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v5 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v2, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v12, v3, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s4, v[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_mul_lo_u32 v10, v10, v4 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v2, v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s6 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] -; 
GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 -; GFX9-O0-NEXT: v_add_co_u32_e64 v13, s[6:7], v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[6:7], v10, v11 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v8, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 ; GFX9-O0-NEXT: ; 
implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v8, v7, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 
v7, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v17 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v19, v11, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v8 -; GFX9-O0-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v5, v7, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v16 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v18, v10, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v7 +; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v4, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 
def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[6:7], v7, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v15, s[6:7], v8, v15, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v6, s[6:7], v6, v15 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v14, s[6:7], v7, v14, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff ; GFX9-O0-NEXT: s_mov_b32 s8, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: v_and_b32_e64 v14, v14, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 ; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: v_and_b32_e64 v17, v16, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v5, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-O0-NEXT: v_and_b32_e64 v16, v15, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v4, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr7 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v6 -; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v19, v6, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v18, v5, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v15, s[6:7], v6, v15, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed 
$exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 -; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[5:6] -; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v8, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v15 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v14, s[6:7], v5, v14, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], s4, v[4:5] +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 +; GFX9-O0-NEXT: v_add_co_u32_e64 v14, s[6:7], v14, v15 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v6, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 -; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: v_add_co_u32_e64 v14, s[6:7], v6, v7 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v8 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v3, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v7 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v3, v6, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 -; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def 
$vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v7 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[7:8] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_lshrrev_b64 v[5:6], s4, v[5:6] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: 
v_lshrrev_b64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 5f291489848fe..dd7f6bea545ad 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -8,7 +8,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-LABEL: kernel0: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART @@ -22,46 +21,47 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s2, 0 +; CHECK-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 -; CHECK-NEXT: v_writelane_b32 v23, s3, 1 +; CHECK-NEXT: v_writelane_b32 v0, 
s2, 0 +; CHECK-NEXT: v_writelane_b32 v0, s3, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 2 -; CHECK-NEXT: v_writelane_b32 v23, s5, 3 -; CHECK-NEXT: v_writelane_b32 v23, s6, 4 -; CHECK-NEXT: v_writelane_b32 v23, s7, 5 +; CHECK-NEXT: v_writelane_b32 v0, s4, 2 +; CHECK-NEXT: v_writelane_b32 v0, s5, 3 +; CHECK-NEXT: v_writelane_b32 v0, s6, 4 +; CHECK-NEXT: v_writelane_b32 v0, s7, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 6 -; CHECK-NEXT: v_writelane_b32 v23, s5, 7 -; CHECK-NEXT: v_writelane_b32 v23, s6, 8 -; CHECK-NEXT: v_writelane_b32 v23, s7, 9 -; CHECK-NEXT: v_writelane_b32 v23, s8, 10 -; CHECK-NEXT: v_writelane_b32 v23, s9, 11 -; CHECK-NEXT: v_writelane_b32 v23, s10, 12 -; CHECK-NEXT: v_writelane_b32 v23, s11, 13 +; CHECK-NEXT: v_writelane_b32 v0, s4, 6 +; CHECK-NEXT: v_writelane_b32 v0, s5, 7 +; CHECK-NEXT: v_writelane_b32 v0, s6, 8 +; CHECK-NEXT: v_writelane_b32 v0, s7, 9 +; CHECK-NEXT: v_writelane_b32 v0, s8, 10 +; CHECK-NEXT: v_writelane_b32 v0, s9, 11 +; CHECK-NEXT: v_writelane_b32 v0, s10, 12 +; CHECK-NEXT: v_writelane_b32 v0, s11, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:19] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 14 -; CHECK-NEXT: v_writelane_b32 v23, s5, 15 -; CHECK-NEXT: v_writelane_b32 v23, s6, 16 -; CHECK-NEXT: v_writelane_b32 v23, s7, 17 -; CHECK-NEXT: v_writelane_b32 v23, s8, 18 -; CHECK-NEXT: v_writelane_b32 v23, s9, 19 -; CHECK-NEXT: v_writelane_b32 v23, s10, 20 -; CHECK-NEXT: v_writelane_b32 v23, s11, 21 -; CHECK-NEXT: v_writelane_b32 v23, s12, 22 -; CHECK-NEXT: v_writelane_b32 v23, s13, 23 -; CHECK-NEXT: v_writelane_b32 v23, s14, 24 -; CHECK-NEXT: v_writelane_b32 v23, s15, 25 -; CHECK-NEXT: v_writelane_b32 v23, s16, 26 -; CHECK-NEXT: v_writelane_b32 v23, s17, 27 -; CHECK-NEXT: v_writelane_b32 v23, s18, 28 -; CHECK-NEXT: v_writelane_b32 v23, s19, 29 +; 
CHECK-NEXT: v_writelane_b32 v0, s4, 14 +; CHECK-NEXT: v_writelane_b32 v0, s5, 15 +; CHECK-NEXT: v_writelane_b32 v0, s6, 16 +; CHECK-NEXT: v_writelane_b32 v0, s7, 17 +; CHECK-NEXT: v_writelane_b32 v0, s8, 18 +; CHECK-NEXT: v_writelane_b32 v0, s9, 19 +; CHECK-NEXT: v_writelane_b32 v0, s10, 20 +; CHECK-NEXT: v_writelane_b32 v0, s11, 21 +; CHECK-NEXT: v_writelane_b32 v0, s12, 22 +; CHECK-NEXT: v_writelane_b32 v0, s13, 23 +; CHECK-NEXT: v_writelane_b32 v0, s14, 24 +; CHECK-NEXT: v_writelane_b32 v0, s15, 25 +; CHECK-NEXT: v_writelane_b32 v0, s16, 26 +; CHECK-NEXT: v_writelane_b32 v0, s17, 27 +; CHECK-NEXT: v_writelane_b32 v0, s18, 28 +; CHECK-NEXT: v_writelane_b32 v0, s19, 29 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[42:43] ; CHECK-NEXT: ;;#ASMEND @@ -71,14 +71,14 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 30 -; CHECK-NEXT: v_writelane_b32 v23, s5, 31 -; CHECK-NEXT: v_writelane_b32 v23, s6, 32 -; CHECK-NEXT: v_writelane_b32 v23, s7, 33 -; CHECK-NEXT: v_writelane_b32 v23, s8, 34 -; CHECK-NEXT: v_writelane_b32 v23, s9, 35 -; CHECK-NEXT: v_writelane_b32 v23, s10, 36 -; CHECK-NEXT: v_writelane_b32 v23, s11, 37 +; CHECK-NEXT: v_writelane_b32 v0, s4, 30 +; CHECK-NEXT: v_writelane_b32 v0, s5, 31 +; CHECK-NEXT: v_writelane_b32 v0, s6, 32 +; CHECK-NEXT: v_writelane_b32 v0, s7, 33 +; CHECK-NEXT: v_writelane_b32 v0, s8, 34 +; CHECK-NEXT: v_writelane_b32 v0, s9, 35 +; CHECK-NEXT: v_writelane_b32 v0, s10, 36 +; CHECK-NEXT: v_writelane_b32 v0, s11, 37 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART @@ -96,161 +96,159 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 38 -; CHECK-NEXT: v_writelane_b32 v23, s1, 39 -; CHECK-NEXT: v_writelane_b32 v23, 
s2, 40 -; CHECK-NEXT: v_writelane_b32 v23, s3, 41 -; CHECK-NEXT: v_writelane_b32 v23, s4, 42 -; CHECK-NEXT: v_writelane_b32 v23, s5, 43 -; CHECK-NEXT: v_writelane_b32 v23, s6, 44 -; CHECK-NEXT: v_writelane_b32 v23, s7, 45 -; CHECK-NEXT: v_writelane_b32 v23, s8, 46 -; CHECK-NEXT: v_writelane_b32 v23, s9, 47 -; CHECK-NEXT: v_writelane_b32 v23, s10, 48 -; CHECK-NEXT: v_writelane_b32 v23, s11, 49 -; CHECK-NEXT: v_writelane_b32 v23, s12, 50 -; CHECK-NEXT: v_writelane_b32 v23, s13, 51 -; CHECK-NEXT: v_writelane_b32 v23, s14, 52 -; CHECK-NEXT: v_writelane_b32 v23, s15, 53 +; CHECK-NEXT: v_writelane_b32 v0, s0, 38 +; CHECK-NEXT: v_writelane_b32 v0, s1, 39 +; CHECK-NEXT: v_writelane_b32 v0, s2, 40 +; CHECK-NEXT: v_writelane_b32 v0, s3, 41 +; CHECK-NEXT: v_writelane_b32 v0, s4, 42 +; CHECK-NEXT: v_writelane_b32 v0, s5, 43 +; CHECK-NEXT: v_writelane_b32 v0, s6, 44 +; CHECK-NEXT: v_writelane_b32 v0, s7, 45 +; CHECK-NEXT: v_writelane_b32 v0, s8, 46 +; CHECK-NEXT: v_writelane_b32 v0, s9, 47 +; CHECK-NEXT: v_writelane_b32 v0, s10, 48 +; CHECK-NEXT: v_writelane_b32 v0, s11, 49 +; CHECK-NEXT: v_writelane_b32 v0, s12, 50 +; CHECK-NEXT: v_writelane_b32 v0, s13, 51 +; CHECK-NEXT: v_writelane_b32 v0, s14, 52 +; CHECK-NEXT: v_writelane_b32 v0, s15, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 54 -; CHECK-NEXT: v_writelane_b32 v23, s1, 55 -; CHECK-NEXT: v_writelane_b32 v23, s2, 56 -; CHECK-NEXT: v_writelane_b32 v23, s3, 57 +; CHECK-NEXT: v_writelane_b32 v0, s0, 54 +; CHECK-NEXT: v_writelane_b32 v0, s1, 55 +; CHECK-NEXT: v_writelane_b32 v0, s2, 56 +; CHECK-NEXT: v_writelane_b32 v0, s3, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 58 -; CHECK-NEXT: v_writelane_b32 v23, s1, 59 -; CHECK-NEXT: v_writelane_b32 v23, s2, 60 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; 
CHECK-NEXT: v_writelane_b32 v23, s3, 61 -; CHECK-NEXT: v_writelane_b32 v23, s4, 62 -; CHECK-NEXT: v_writelane_b32 v0, s6, 0 -; CHECK-NEXT: v_writelane_b32 v23, s5, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 1 +; CHECK-NEXT: v_writelane_b32 v0, s0, 58 +; CHECK-NEXT: v_writelane_b32 v0, s1, 59 +; CHECK-NEXT: v_writelane_b32 v0, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v0, s3, 61 +; CHECK-NEXT: v_writelane_b32 v0, s4, 62 +; CHECK-NEXT: v_writelane_b32 v1, s6, 0 +; CHECK-NEXT: v_writelane_b32 v0, s5, 63 +; CHECK-NEXT: v_writelane_b32 v1, s7, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 2 -; CHECK-NEXT: v_writelane_b32 v0, s1, 3 -; CHECK-NEXT: v_writelane_b32 v0, s2, 4 -; CHECK-NEXT: v_writelane_b32 v0, s3, 5 -; CHECK-NEXT: v_writelane_b32 v0, s4, 6 -; CHECK-NEXT: v_writelane_b32 v0, s5, 7 -; CHECK-NEXT: v_writelane_b32 v0, s6, 8 -; CHECK-NEXT: v_writelane_b32 v0, s7, 9 -; CHECK-NEXT: v_writelane_b32 v0, s8, 10 -; CHECK-NEXT: v_writelane_b32 v0, s9, 11 -; CHECK-NEXT: v_writelane_b32 v0, s10, 12 -; CHECK-NEXT: v_writelane_b32 v0, s11, 13 -; CHECK-NEXT: v_writelane_b32 v0, s12, 14 -; CHECK-NEXT: v_writelane_b32 v0, s13, 15 -; CHECK-NEXT: v_writelane_b32 v0, s14, 16 -; CHECK-NEXT: v_writelane_b32 v0, s15, 17 +; CHECK-NEXT: v_writelane_b32 v1, s0, 2 +; CHECK-NEXT: v_writelane_b32 v1, s1, 3 +; CHECK-NEXT: v_writelane_b32 v1, s2, 4 +; CHECK-NEXT: v_writelane_b32 v1, s3, 5 +; CHECK-NEXT: v_writelane_b32 v1, s4, 6 +; CHECK-NEXT: v_writelane_b32 v1, s5, 7 +; CHECK-NEXT: v_writelane_b32 v1, s6, 8 +; CHECK-NEXT: v_writelane_b32 v1, s7, 9 +; CHECK-NEXT: v_writelane_b32 v1, s8, 10 +; CHECK-NEXT: v_writelane_b32 v1, s9, 11 +; CHECK-NEXT: v_writelane_b32 v1, s10, 12 +; CHECK-NEXT: v_writelane_b32 v1, s11, 13 +; CHECK-NEXT: v_writelane_b32 v1, s12, 14 +; CHECK-NEXT: v_writelane_b32 v1, s13, 15 +; CHECK-NEXT: v_writelane_b32 v1, s14, 16 +; CHECK-NEXT: 
v_writelane_b32 v1, s15, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 18 -; CHECK-NEXT: v_writelane_b32 v0, s1, 19 +; CHECK-NEXT: v_writelane_b32 v1, s0, 18 +; CHECK-NEXT: v_writelane_b32 v1, s1, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 20 -; CHECK-NEXT: v_writelane_b32 v0, s1, 21 -; CHECK-NEXT: v_writelane_b32 v0, s2, 22 -; CHECK-NEXT: v_writelane_b32 v0, s3, 23 +; CHECK-NEXT: v_writelane_b32 v1, s0, 20 +; CHECK-NEXT: v_writelane_b32 v1, s1, 21 +; CHECK-NEXT: v_writelane_b32 v1, s2, 22 +; CHECK-NEXT: v_writelane_b32 v1, s3, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 24 -; CHECK-NEXT: v_writelane_b32 v0, s1, 25 -; CHECK-NEXT: v_writelane_b32 v0, s2, 26 -; CHECK-NEXT: v_writelane_b32 v0, s3, 27 -; CHECK-NEXT: v_writelane_b32 v0, s4, 28 -; CHECK-NEXT: v_writelane_b32 v0, s5, 29 -; CHECK-NEXT: v_writelane_b32 v0, s6, 30 -; CHECK-NEXT: v_writelane_b32 v0, s7, 31 +; CHECK-NEXT: v_writelane_b32 v1, s0, 24 +; CHECK-NEXT: v_writelane_b32 v1, s1, 25 +; CHECK-NEXT: v_writelane_b32 v1, s2, 26 +; CHECK-NEXT: v_writelane_b32 v1, s3, 27 +; CHECK-NEXT: v_writelane_b32 v1, s4, 28 +; CHECK-NEXT: v_writelane_b32 v1, s5, 29 +; CHECK-NEXT: v_writelane_b32 v1, s6, 30 +; CHECK-NEXT: v_writelane_b32 v1, s7, 31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 32 -; CHECK-NEXT: v_writelane_b32 v0, s1, 33 -; CHECK-NEXT: v_writelane_b32 v0, s2, 34 -; CHECK-NEXT: v_writelane_b32 v0, s3, 35 -; CHECK-NEXT: v_writelane_b32 v0, s4, 36 -; CHECK-NEXT: v_writelane_b32 v0, s5, 37 -; CHECK-NEXT: v_writelane_b32 v0, s6, 38 -; CHECK-NEXT: v_writelane_b32 v0, s7, 39 -; CHECK-NEXT: v_writelane_b32 v0, s8, 40 -; CHECK-NEXT: v_writelane_b32 v0, s9, 41 -; CHECK-NEXT: v_writelane_b32 v0, s10, 42 -; CHECK-NEXT: 
v_writelane_b32 v0, s11, 43 -; CHECK-NEXT: v_writelane_b32 v0, s12, 44 -; CHECK-NEXT: v_writelane_b32 v0, s13, 45 -; CHECK-NEXT: v_writelane_b32 v0, s14, 46 -; CHECK-NEXT: v_writelane_b32 v0, s15, 47 +; CHECK-NEXT: v_writelane_b32 v1, s0, 32 +; CHECK-NEXT: v_writelane_b32 v1, s1, 33 +; CHECK-NEXT: v_writelane_b32 v1, s2, 34 +; CHECK-NEXT: v_writelane_b32 v1, s3, 35 +; CHECK-NEXT: v_writelane_b32 v1, s4, 36 +; CHECK-NEXT: v_writelane_b32 v1, s5, 37 +; CHECK-NEXT: v_writelane_b32 v1, s6, 38 +; CHECK-NEXT: v_writelane_b32 v1, s7, 39 +; CHECK-NEXT: v_writelane_b32 v1, s8, 40 +; CHECK-NEXT: v_writelane_b32 v1, s9, 41 +; CHECK-NEXT: v_writelane_b32 v1, s10, 42 +; CHECK-NEXT: v_writelane_b32 v1, s11, 43 +; CHECK-NEXT: v_writelane_b32 v1, s12, 44 +; CHECK-NEXT: v_writelane_b32 v1, s13, 45 +; CHECK-NEXT: v_writelane_b32 v1, s14, 46 +; CHECK-NEXT: v_writelane_b32 v1, s15, 47 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ret -; CHECK-NEXT: ; kill: killed $vgpr23 -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %bb0 -; CHECK-NEXT: v_readlane_b32 s0, v23, 0 -; CHECK-NEXT: v_readlane_b32 s1, v23, 1 +; CHECK-NEXT: v_readlane_b32 s0, v0, 0 +; CHECK-NEXT: v_readlane_b32 s1, v0, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 2 -; CHECK-NEXT: v_readlane_b32 s1, v23, 3 -; CHECK-NEXT: v_readlane_b32 s2, v23, 4 -; CHECK-NEXT: v_readlane_b32 s3, v23, 5 +; CHECK-NEXT: v_readlane_b32 s0, v0, 2 +; CHECK-NEXT: v_readlane_b32 s1, v0, 3 +; CHECK-NEXT: v_readlane_b32 s2, v0, 4 +; CHECK-NEXT: v_readlane_b32 s3, v0, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 6 -; CHECK-NEXT: v_readlane_b32 s1, v23, 7 -; CHECK-NEXT: v_readlane_b32 s2, v23, 8 -; CHECK-NEXT: v_readlane_b32 s3, v23, 9 -; CHECK-NEXT: v_readlane_b32 s4, v23, 10 -; CHECK-NEXT: v_readlane_b32 s5, v23, 11 -; CHECK-NEXT: v_readlane_b32 s6, v23, 
12 -; CHECK-NEXT: v_readlane_b32 s7, v23, 13 +; CHECK-NEXT: v_readlane_b32 s0, v0, 6 +; CHECK-NEXT: v_readlane_b32 s1, v0, 7 +; CHECK-NEXT: v_readlane_b32 s2, v0, 8 +; CHECK-NEXT: v_readlane_b32 s3, v0, 9 +; CHECK-NEXT: v_readlane_b32 s4, v0, 10 +; CHECK-NEXT: v_readlane_b32 s5, v0, 11 +; CHECK-NEXT: v_readlane_b32 s6, v0, 12 +; CHECK-NEXT: v_readlane_b32 s7, v0, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 14 -; CHECK-NEXT: v_readlane_b32 s1, v23, 15 -; CHECK-NEXT: v_readlane_b32 s2, v23, 16 -; CHECK-NEXT: v_readlane_b32 s3, v23, 17 -; CHECK-NEXT: v_readlane_b32 s4, v23, 18 -; CHECK-NEXT: v_readlane_b32 s5, v23, 19 -; CHECK-NEXT: v_readlane_b32 s6, v23, 20 -; CHECK-NEXT: v_readlane_b32 s7, v23, 21 -; CHECK-NEXT: v_readlane_b32 s8, v23, 22 -; CHECK-NEXT: v_readlane_b32 s9, v23, 23 -; CHECK-NEXT: v_readlane_b32 s10, v23, 24 -; CHECK-NEXT: v_readlane_b32 s11, v23, 25 -; CHECK-NEXT: v_readlane_b32 s12, v23, 26 -; CHECK-NEXT: v_readlane_b32 s13, v23, 27 -; CHECK-NEXT: v_readlane_b32 s14, v23, 28 -; CHECK-NEXT: v_readlane_b32 s15, v23, 29 +; CHECK-NEXT: v_readlane_b32 s0, v0, 14 +; CHECK-NEXT: v_readlane_b32 s1, v0, 15 +; CHECK-NEXT: v_readlane_b32 s2, v0, 16 +; CHECK-NEXT: v_readlane_b32 s3, v0, 17 +; CHECK-NEXT: v_readlane_b32 s4, v0, 18 +; CHECK-NEXT: v_readlane_b32 s5, v0, 19 +; CHECK-NEXT: v_readlane_b32 s6, v0, 20 +; CHECK-NEXT: v_readlane_b32 s7, v0, 21 +; CHECK-NEXT: v_readlane_b32 s8, v0, 22 +; CHECK-NEXT: v_readlane_b32 s9, v0, 23 +; CHECK-NEXT: v_readlane_b32 s10, v0, 24 +; CHECK-NEXT: v_readlane_b32 s11, v0, 25 +; CHECK-NEXT: v_readlane_b32 s12, v0, 26 +; CHECK-NEXT: v_readlane_b32 s13, v0, 27 +; CHECK-NEXT: v_readlane_b32 s14, v0, 28 +; CHECK-NEXT: v_readlane_b32 s15, v0, 29 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 30 -; CHECK-NEXT: v_readlane_b32 s1, v23, 31 -; CHECK-NEXT: v_readlane_b32 s2, v23, 32 -; 
CHECK-NEXT: v_readlane_b32 s3, v23, 33 -; CHECK-NEXT: v_readlane_b32 s4, v23, 34 -; CHECK-NEXT: v_readlane_b32 s5, v23, 35 -; CHECK-NEXT: v_readlane_b32 s6, v23, 36 -; CHECK-NEXT: v_readlane_b32 s7, v23, 37 +; CHECK-NEXT: v_readlane_b32 s0, v0, 30 +; CHECK-NEXT: v_readlane_b32 s1, v0, 31 +; CHECK-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-NEXT: v_readlane_b32 s3, v0, 33 +; CHECK-NEXT: v_readlane_b32 s4, v0, 34 +; CHECK-NEXT: v_readlane_b32 s5, v0, 35 +; CHECK-NEXT: v_readlane_b32 s6, v0, 36 +; CHECK-NEXT: v_readlane_b32 s7, v0, 37 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[42:43] ; CHECK-NEXT: ;;#ASMEND @@ -260,10 +258,10 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 38 -; CHECK-NEXT: v_readlane_b32 s1, v23, 39 -; CHECK-NEXT: v_readlane_b32 s2, v23, 40 -; CHECK-NEXT: v_readlane_b32 s3, v23, 41 +; CHECK-NEXT: v_readlane_b32 s0, v0, 38 +; CHECK-NEXT: v_readlane_b32 s1, v0, 39 +; CHECK-NEXT: v_readlane_b32 s2, v0, 40 +; CHECK-NEXT: v_readlane_b32 s3, v0, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND @@ -276,105 +274,103 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[44:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s4, v23, 42 -; CHECK-NEXT: v_readlane_b32 s5, v23, 43 -; CHECK-NEXT: v_readlane_b32 s6, v23, 44 -; CHECK-NEXT: v_readlane_b32 s7, v23, 45 -; CHECK-NEXT: v_readlane_b32 s8, v23, 46 -; CHECK-NEXT: v_readlane_b32 s9, v23, 47 -; CHECK-NEXT: v_readlane_b32 s10, v23, 48 -; CHECK-NEXT: v_readlane_b32 s11, v23, 49 -; CHECK-NEXT: v_readlane_b32 s12, v23, 50 -; CHECK-NEXT: v_readlane_b32 s13, v23, 51 -; CHECK-NEXT: v_readlane_b32 s14, v23, 52 -; CHECK-NEXT: v_readlane_b32 s15, v23, 53 +; CHECK-NEXT: v_readlane_b32 s4, v0, 42 +; CHECK-NEXT: v_readlane_b32 s5, v0, 43 +; CHECK-NEXT: v_readlane_b32 s6, 
v0, 44 +; CHECK-NEXT: v_readlane_b32 s7, v0, 45 +; CHECK-NEXT: v_readlane_b32 s8, v0, 46 +; CHECK-NEXT: v_readlane_b32 s9, v0, 47 +; CHECK-NEXT: v_readlane_b32 s10, v0, 48 +; CHECK-NEXT: v_readlane_b32 s11, v0, 49 +; CHECK-NEXT: v_readlane_b32 s12, v0, 50 +; CHECK-NEXT: v_readlane_b32 s13, v0, 51 +; CHECK-NEXT: v_readlane_b32 s14, v0, 52 +; CHECK-NEXT: v_readlane_b32 s15, v0, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 54 -; CHECK-NEXT: v_readlane_b32 s1, v23, 55 -; CHECK-NEXT: v_readlane_b32 s2, v23, 56 -; CHECK-NEXT: v_readlane_b32 s3, v23, 57 +; CHECK-NEXT: v_readlane_b32 s0, v0, 54 +; CHECK-NEXT: v_readlane_b32 s1, v0, 55 +; CHECK-NEXT: v_readlane_b32 s2, v0, 56 +; CHECK-NEXT: v_readlane_b32 s3, v0, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 58 -; CHECK-NEXT: v_readlane_b32 s1, v23, 59 -; CHECK-NEXT: v_readlane_b32 s2, v23, 60 -; CHECK-NEXT: v_readlane_b32 s3, v23, 61 -; CHECK-NEXT: v_readlane_b32 s4, v23, 62 -; CHECK-NEXT: v_readlane_b32 s5, v23, 63 -; CHECK-NEXT: v_readlane_b32 s6, v0, 0 -; CHECK-NEXT: v_readlane_b32 s7, v0, 1 +; CHECK-NEXT: v_readlane_b32 s0, v0, 58 +; CHECK-NEXT: v_readlane_b32 s1, v0, 59 +; CHECK-NEXT: v_readlane_b32 s2, v0, 60 +; CHECK-NEXT: v_readlane_b32 s3, v0, 61 +; CHECK-NEXT: v_readlane_b32 s4, v0, 62 +; CHECK-NEXT: v_readlane_b32 s5, v0, 63 +; CHECK-NEXT: v_readlane_b32 s6, v1, 0 +; CHECK-NEXT: v_readlane_b32 s7, v1, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 2 -; CHECK-NEXT: v_readlane_b32 s1, v0, 3 -; CHECK-NEXT: v_readlane_b32 s2, v0, 4 -; CHECK-NEXT: v_readlane_b32 s3, v0, 5 -; CHECK-NEXT: v_readlane_b32 s4, v0, 6 -; CHECK-NEXT: v_readlane_b32 s5, v0, 7 -; CHECK-NEXT: v_readlane_b32 s6, v0, 8 -; CHECK-NEXT: v_readlane_b32 s7, 
v0, 9 -; CHECK-NEXT: v_readlane_b32 s8, v0, 10 -; CHECK-NEXT: v_readlane_b32 s9, v0, 11 -; CHECK-NEXT: v_readlane_b32 s10, v0, 12 -; CHECK-NEXT: v_readlane_b32 s11, v0, 13 -; CHECK-NEXT: v_readlane_b32 s12, v0, 14 -; CHECK-NEXT: v_readlane_b32 s13, v0, 15 -; CHECK-NEXT: v_readlane_b32 s14, v0, 16 -; CHECK-NEXT: v_readlane_b32 s15, v0, 17 +; CHECK-NEXT: v_readlane_b32 s0, v1, 2 +; CHECK-NEXT: v_readlane_b32 s1, v1, 3 +; CHECK-NEXT: v_readlane_b32 s2, v1, 4 +; CHECK-NEXT: v_readlane_b32 s3, v1, 5 +; CHECK-NEXT: v_readlane_b32 s4, v1, 6 +; CHECK-NEXT: v_readlane_b32 s5, v1, 7 +; CHECK-NEXT: v_readlane_b32 s6, v1, 8 +; CHECK-NEXT: v_readlane_b32 s7, v1, 9 +; CHECK-NEXT: v_readlane_b32 s8, v1, 10 +; CHECK-NEXT: v_readlane_b32 s9, v1, 11 +; CHECK-NEXT: v_readlane_b32 s10, v1, 12 +; CHECK-NEXT: v_readlane_b32 s11, v1, 13 +; CHECK-NEXT: v_readlane_b32 s12, v1, 14 +; CHECK-NEXT: v_readlane_b32 s13, v1, 15 +; CHECK-NEXT: v_readlane_b32 s14, v1, 16 +; CHECK-NEXT: v_readlane_b32 s15, v1, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 18 -; CHECK-NEXT: v_readlane_b32 s1, v0, 19 +; CHECK-NEXT: v_readlane_b32 s0, v1, 18 +; CHECK-NEXT: v_readlane_b32 s1, v1, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 20 -; CHECK-NEXT: v_readlane_b32 s1, v0, 21 -; CHECK-NEXT: v_readlane_b32 s2, v0, 22 -; CHECK-NEXT: v_readlane_b32 s3, v0, 23 +; CHECK-NEXT: v_readlane_b32 s0, v1, 20 +; CHECK-NEXT: v_readlane_b32 s1, v1, 21 +; CHECK-NEXT: v_readlane_b32 s2, v1, 22 +; CHECK-NEXT: v_readlane_b32 s3, v1, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 24 -; CHECK-NEXT: v_readlane_b32 s1, v0, 25 -; CHECK-NEXT: v_readlane_b32 s2, v0, 26 -; CHECK-NEXT: v_readlane_b32 s3, v0, 27 -; CHECK-NEXT: v_readlane_b32 s4, v0, 28 -; CHECK-NEXT: v_readlane_b32 s5, v0, 29 -; CHECK-NEXT: v_readlane_b32 s6, 
v0, 30 -; CHECK-NEXT: v_readlane_b32 s7, v0, 31 +; CHECK-NEXT: v_readlane_b32 s0, v1, 24 +; CHECK-NEXT: v_readlane_b32 s1, v1, 25 +; CHECK-NEXT: v_readlane_b32 s2, v1, 26 +; CHECK-NEXT: v_readlane_b32 s3, v1, 27 +; CHECK-NEXT: v_readlane_b32 s4, v1, 28 +; CHECK-NEXT: v_readlane_b32 s5, v1, 29 +; CHECK-NEXT: v_readlane_b32 s6, v1, 30 +; CHECK-NEXT: v_readlane_b32 s7, v1, 31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 32 -; CHECK-NEXT: v_readlane_b32 s1, v0, 33 -; CHECK-NEXT: v_readlane_b32 s2, v0, 34 -; CHECK-NEXT: v_readlane_b32 s3, v0, 35 -; CHECK-NEXT: v_readlane_b32 s4, v0, 36 -; CHECK-NEXT: v_readlane_b32 s5, v0, 37 -; CHECK-NEXT: v_readlane_b32 s6, v0, 38 -; CHECK-NEXT: v_readlane_b32 s7, v0, 39 -; CHECK-NEXT: v_readlane_b32 s8, v0, 40 -; CHECK-NEXT: v_readlane_b32 s9, v0, 41 -; CHECK-NEXT: v_readlane_b32 s10, v0, 42 -; CHECK-NEXT: v_readlane_b32 s11, v0, 43 -; CHECK-NEXT: v_readlane_b32 s12, v0, 44 -; CHECK-NEXT: v_readlane_b32 s13, v0, 45 -; CHECK-NEXT: v_readlane_b32 s14, v0, 46 -; CHECK-NEXT: v_readlane_b32 s15, v0, 47 +; CHECK-NEXT: v_readlane_b32 s0, v1, 32 +; CHECK-NEXT: v_readlane_b32 s1, v1, 33 +; CHECK-NEXT: v_readlane_b32 s2, v1, 34 +; CHECK-NEXT: v_readlane_b32 s3, v1, 35 +; CHECK-NEXT: v_readlane_b32 s4, v1, 36 +; CHECK-NEXT: v_readlane_b32 s5, v1, 37 +; CHECK-NEXT: v_readlane_b32 s6, v1, 38 +; CHECK-NEXT: v_readlane_b32 s7, v1, 39 +; CHECK-NEXT: v_readlane_b32 s8, v1, 40 +; CHECK-NEXT: v_readlane_b32 s9, v1, 41 +; CHECK-NEXT: v_readlane_b32 s10, v1, 42 +; CHECK-NEXT: v_readlane_b32 s11, v1, 43 +; CHECK-NEXT: v_readlane_b32 s12, v1, 44 +; CHECK-NEXT: v_readlane_b32 s13, v1, 45 +; CHECK-NEXT: v_readlane_b32 s14, v1, 46 +; CHECK-NEXT: v_readlane_b32 s15, v1, 47 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; kill: killed $vgpr23 -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 
call void asm sideeffect "", "~{v[8:15]}" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir index a7c841d54f7c9..9fc53cb939fdd 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s @@ -44,28 +45,27 @@ body: | ; SGPR_SPILL: bb.0: ; SGPR_SPILL-NEXT: successors: %bb.1(0x80000000) ; SGPR_SPILL-NEXT: {{ $}} - ; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILL-NEXT: renamable $sgpr10 = IMPLICIT_DEF - ; SGPR_SPILL-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; SGPR_SPILL-NEXT: DBG_VALUE $noreg, 0 ; SGPR_SPILL-NEXT: {{ $}} ; SGPR_SPILL-NEXT: bb.1: - ; SGPR_SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[V_WRITELANE_B32_]], 0 - ; SGPR_SPILL-NEXT: KILL [[V_WRITELANE_B32_]] + ; SGPR_SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0 ; SGPR_SPILL-NEXT: S_ENDPGM 0 + ; ; PEI-LABEL: name: test ; PEI: bb.0: ; PEI-NEXT: successors: %bb.1(0x80000000) ; PEI-NEXT: {{ $}} - ; PEI-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; PEI-NEXT: renamable $sgpr10 = IMPLICIT_DEF + ; PEI-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; PEI-NEXT: renamable $vgpr0 = 
SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, killed $vgpr0 ; PEI-NEXT: {{ $}} ; PEI-NEXT: bb.1: ; PEI-NEXT: liveins: $vgpr0 ; PEI-NEXT: {{ $}} - ; PEI-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 - ; PEI-NEXT: KILL killed renamable $vgpr0 + ; PEI-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 ; PEI-NEXT: S_ENDPGM 0 bb.0: renamable $sgpr10 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index d430ba758572d..4efb2822cb1f8 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -9,17 +9,9 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 s0, s0, s13 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -31,87 +23,87 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v1, s8, 0 -; GCN-NEXT: v_writelane_b32 v1, s9, 1 -; GCN-NEXT: v_writelane_b32 v1, s10, 2 -; GCN-NEXT: v_writelane_b32 v1, s11, 3 -; GCN-NEXT: v_writelane_b32 v1, s12, 4 -; GCN-NEXT: v_writelane_b32 v1, s13, 5 -; GCN-NEXT: v_writelane_b32 v1, s14, 6 -; GCN-NEXT: v_writelane_b32 v1, s15, 7 -; GCN-NEXT: v_writelane_b32 v1, s16, 8 -; GCN-NEXT: 
v_writelane_b32 v1, s17, 9 -; GCN-NEXT: v_writelane_b32 v1, s18, 10 -; GCN-NEXT: v_writelane_b32 v1, s19, 11 -; GCN-NEXT: v_writelane_b32 v1, s20, 12 -; GCN-NEXT: v_writelane_b32 v1, s21, 13 -; GCN-NEXT: v_writelane_b32 v1, s22, 14 -; GCN-NEXT: v_writelane_b32 v1, s23, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-NEXT: v_writelane_b32 v0, s8, 0 +; GCN-NEXT: v_writelane_b32 v0, s9, 1 +; GCN-NEXT: v_writelane_b32 v0, s10, 2 +; GCN-NEXT: v_writelane_b32 v0, s11, 3 +; GCN-NEXT: v_writelane_b32 v0, s12, 4 +; GCN-NEXT: v_writelane_b32 v0, s13, 5 +; GCN-NEXT: v_writelane_b32 v0, s14, 6 +; GCN-NEXT: v_writelane_b32 v0, s15, 7 +; GCN-NEXT: v_writelane_b32 v0, s16, 8 +; GCN-NEXT: v_writelane_b32 v0, s17, 9 +; GCN-NEXT: v_writelane_b32 v0, s18, 10 +; GCN-NEXT: v_writelane_b32 v0, s19, 11 +; GCN-NEXT: v_writelane_b32 v0, s20, 12 +; GCN-NEXT: v_writelane_b32 v0, s21, 13 +; GCN-NEXT: v_writelane_b32 v0, s22, 14 +; GCN-NEXT: v_writelane_b32 v0, s23, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s8, 16 -; GCN-NEXT: v_writelane_b32 v1, s9, 17 -; GCN-NEXT: v_writelane_b32 v1, s10, 18 -; GCN-NEXT: v_writelane_b32 v1, s11, 19 -; GCN-NEXT: v_writelane_b32 v1, s12, 20 -; GCN-NEXT: v_writelane_b32 v1, s13, 21 -; GCN-NEXT: v_writelane_b32 v1, s14, 22 -; GCN-NEXT: v_writelane_b32 v1, s15, 23 -; GCN-NEXT: v_writelane_b32 v1, s16, 24 -; GCN-NEXT: v_writelane_b32 v1, s17, 25 -; GCN-NEXT: v_writelane_b32 v1, s18, 26 -; GCN-NEXT: v_writelane_b32 v1, s19, 27 -; GCN-NEXT: v_writelane_b32 v1, s20, 28 -; GCN-NEXT: v_writelane_b32 v1, s21, 29 -; GCN-NEXT: v_writelane_b32 v1, s22, 30 -; GCN-NEXT: v_writelane_b32 v1, s23, 31 +; GCN-NEXT: v_writelane_b32 v0, s8, 16 +; GCN-NEXT: v_writelane_b32 v0, s9, 17 +; GCN-NEXT: v_writelane_b32 v0, s10, 18 +; GCN-NEXT: v_writelane_b32 v0, s11, 19 +; GCN-NEXT: v_writelane_b32 v0, s12, 20 +; GCN-NEXT: v_writelane_b32 v0, s13, 21 +; GCN-NEXT: v_writelane_b32 v0, s14, 22 +; 
GCN-NEXT: v_writelane_b32 v0, s15, 23 +; GCN-NEXT: v_writelane_b32 v0, s16, 24 +; GCN-NEXT: v_writelane_b32 v0, s17, 25 +; GCN-NEXT: v_writelane_b32 v0, s18, 26 +; GCN-NEXT: v_writelane_b32 v0, s19, 27 +; GCN-NEXT: v_writelane_b32 v0, s20, 28 +; GCN-NEXT: v_writelane_b32 v0, s21, 29 +; GCN-NEXT: v_writelane_b32 v0, s22, 30 +; GCN-NEXT: v_writelane_b32 v0, s23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s8, 32 -; GCN-NEXT: v_writelane_b32 v1, s9, 33 -; GCN-NEXT: v_writelane_b32 v1, s10, 34 -; GCN-NEXT: v_writelane_b32 v1, s11, 35 -; GCN-NEXT: v_writelane_b32 v1, s12, 36 -; GCN-NEXT: v_writelane_b32 v1, s13, 37 -; GCN-NEXT: v_writelane_b32 v1, s14, 38 -; GCN-NEXT: v_writelane_b32 v1, s15, 39 -; GCN-NEXT: v_writelane_b32 v1, s16, 40 -; GCN-NEXT: v_writelane_b32 v1, s17, 41 -; GCN-NEXT: v_writelane_b32 v1, s18, 42 -; GCN-NEXT: v_writelane_b32 v1, s19, 43 -; GCN-NEXT: v_writelane_b32 v1, s20, 44 -; GCN-NEXT: v_writelane_b32 v1, s21, 45 -; GCN-NEXT: v_writelane_b32 v1, s22, 46 -; GCN-NEXT: v_writelane_b32 v1, s23, 47 +; GCN-NEXT: v_writelane_b32 v0, s8, 32 +; GCN-NEXT: v_writelane_b32 v0, s9, 33 +; GCN-NEXT: v_writelane_b32 v0, s10, 34 +; GCN-NEXT: v_writelane_b32 v0, s11, 35 +; GCN-NEXT: v_writelane_b32 v0, s12, 36 +; GCN-NEXT: v_writelane_b32 v0, s13, 37 +; GCN-NEXT: v_writelane_b32 v0, s14, 38 +; GCN-NEXT: v_writelane_b32 v0, s15, 39 +; GCN-NEXT: v_writelane_b32 v0, s16, 40 +; GCN-NEXT: v_writelane_b32 v0, s17, 41 +; GCN-NEXT: v_writelane_b32 v0, s18, 42 +; GCN-NEXT: v_writelane_b32 v0, s19, 43 +; GCN-NEXT: v_writelane_b32 v0, s20, 44 +; GCN-NEXT: v_writelane_b32 v0, s21, 45 +; GCN-NEXT: v_writelane_b32 v0, s22, 46 +; GCN-NEXT: v_writelane_b32 v0, s23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s8, 48 -; GCN-NEXT: v_writelane_b32 v1, s9, 49 -; GCN-NEXT: v_writelane_b32 v1, s10, 50 -; GCN-NEXT: v_writelane_b32 v1, s11, 51 -; GCN-NEXT: 
v_writelane_b32 v1, s12, 52 -; GCN-NEXT: v_writelane_b32 v1, s13, 53 -; GCN-NEXT: v_writelane_b32 v1, s14, 54 -; GCN-NEXT: v_writelane_b32 v1, s15, 55 -; GCN-NEXT: v_writelane_b32 v1, s16, 56 -; GCN-NEXT: v_writelane_b32 v1, s17, 57 -; GCN-NEXT: v_writelane_b32 v1, s18, 58 -; GCN-NEXT: v_writelane_b32 v1, s19, 59 -; GCN-NEXT: v_writelane_b32 v1, s20, 60 -; GCN-NEXT: v_writelane_b32 v1, s21, 61 -; GCN-NEXT: v_writelane_b32 v1, s22, 62 -; GCN-NEXT: v_writelane_b32 v1, s23, 63 +; GCN-NEXT: v_writelane_b32 v0, s8, 48 +; GCN-NEXT: v_writelane_b32 v0, s9, 49 +; GCN-NEXT: v_writelane_b32 v0, s10, 50 +; GCN-NEXT: v_writelane_b32 v0, s11, 51 +; GCN-NEXT: v_writelane_b32 v0, s12, 52 +; GCN-NEXT: v_writelane_b32 v0, s13, 53 +; GCN-NEXT: v_writelane_b32 v0, s14, 54 +; GCN-NEXT: v_writelane_b32 v0, s15, 55 +; GCN-NEXT: v_writelane_b32 v0, s16, 56 +; GCN-NEXT: v_writelane_b32 v0, s17, 57 +; GCN-NEXT: v_writelane_b32 v0, s18, 58 +; GCN-NEXT: v_writelane_b32 v0, s19, 59 +; GCN-NEXT: v_writelane_b32 v0, s20, 60 +; GCN-NEXT: v_writelane_b32 v0, s21, 61 +; GCN-NEXT: v_writelane_b32 v0, s22, 62 +; GCN-NEXT: v_writelane_b32 v0, s23, 63 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[6:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v0, s6, 0 ; GCN-NEXT: v_writelane_b32 v0, s7, 1 ; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 @@ -212,14 +204,6 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou ; GCN-NEXT: ; use s[4:5] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; 
GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 6a2532147f886..6554b68c0a4b7 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -41,7 +41,6 @@ body: | ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr5 ; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc - ; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr3 ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr3 ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr3 @@ -111,6 +110,7 @@ body: | ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr5 ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5 ; GCN-NEXT: $sgpr22 = IMPLICIT_DEF + ; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2 ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc ; GCN-NEXT: {{ $}} @@ -124,7 +124,7 @@ body: | ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR killed $vgpr2, 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -197,7 +197,6 @@ body: | ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2 ; GCN-NEXT: $sgpr5 = 
SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 - ; GCN-NEXT: KILL killed renamable $vgpr2 ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 4 ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir index 774785fb3966f..1f2e66960eed8 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir @@ -22,9 +22,9 @@ body: | ; CHECK-LABEL: name: sgpr_spill_s64_undef_high32 ; CHECK: liveins: $sgpr4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]], implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]], implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... 
@@ -48,9 +48,9 @@ body: | ; CHECK-LABEL: name: sgpr_spill_s64_undef_low32 ; CHECK: liveins: $sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]], implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]], implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 764f4942cbd03..cb2a522059213 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -717,19 +717,15 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-NEXT: buffer_store_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s4 ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill @@ -745,10 +741,6 @@ define void @spill_sgpr_with_sgpr_uses() #0 { ; GCN-NEXT: ; use s4 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index c5a5a5209f54f..2624ce850a378 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -14,12 +14,11 @@ ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 -; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 ; SGPR-NEXT: s_or_saveexec_b64 s[100:101], -1 ; SGPR-NEXT: s_mov_b64 exec, s[100:101] -; SGPR-NEXT: s_nop 2 +; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 +; SGPR-NEXT: s_nop 4 ; SGPR-NEXT: buffer_store_dword v0, off, s[{{[0-9]+}}:[[HI]]], 0 -; SGPR-NEXT: ; kill: killed $vgpr1 ; ALL: s_endpgm define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll index b045dd559aac2..10a910ccb2e70 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -3,19 +3,19 @@ ; GCN-LABEL: 
{{^}}spill_csr_s5_copy: ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; GCN: s_xor_saveexec_b64 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 +; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 ; GCN: s_xor_saveexec_b64 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec diff --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir index 05e1082de4478..db5935202c520 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir @@ -25,14 +25,13 @@ body: | ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr1, 1, killed $vgpr0 ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr2, 2, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: dead renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: renamable $sgpr8 = COPY renamable $sgpr1 - ; GCN-NEXT: KILL killed renamable $vgpr0 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -64,13 +63,12 @@ body: | ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr1, 1, killed $vgpr0 ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr2, 2, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: dead renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; 
GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir index 11babc82e919b..6ec1f2b1edf24 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -21,9 +21,9 @@ body: | ; GCN-LABEL: name: sgpr32_spill ; GCN: liveins: $sgpr30_sgpr31, $sgpr10 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0 ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 S_NOP 0 @@ -55,7 +55,6 @@ body: | ; GCN-LABEL: name: sgpr_spill_lane_crossover ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr63 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr63 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr66, 2, $vgpr63 @@ -89,39 +88,40 @@ body: | ; GCN-NEXT: $vgpr63 
= SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr63 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr63 ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr64, 1, [[DEF]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr65, 2, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr66, 3, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr67, 4, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr68, 5, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr69, 6, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr70, 7, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr71, 8, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr72, 9, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr73, 10, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr74, 11, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr75, 12, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr76, 13, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr77, 14, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr78, 15, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr79, 16, 
[[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr80, 17, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr81, 18, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr82, 19, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr83, 20, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr84, 21, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr85, 22, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr86, 23, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr87, 24, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr88, 25, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr89, 26, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr90, 27, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr91, 28, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr92, 29, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr93, 30, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr94, 31, [[DEF]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 32, [[DEF]], implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr64, 1, [[DEF]], implicit-def 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr65, 2, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr66, 3, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr67, 4, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr68, 5, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr69, 6, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr70, 7, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr71, 8, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr72, 9, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr73, 10, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr74, 11, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr75, 12, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr76, 13, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr77, 14, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr78, 15, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr79, 16, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr80, 17, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr81, 18, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr82, 19, [[DEF]] 
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr83, 20, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr84, 21, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr85, 22, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr86, 23, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr87, 24, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr88, 25, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr89, 26, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr90, 27, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr91, 28, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr92, 29, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr93, 30, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr94, 31, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 32, [[DEF]], implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 1, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 2 @@ -187,9 +187,9 @@ body: | ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: 
S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: @@ -197,7 +197,7 @@ body: | ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: @@ -205,7 +205,7 @@ body: | ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr10 = S_MOV_B32 20 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: @@ -256,7 +256,6 @@ body: | ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} @@ -264,7 +263,7 @@ body: | ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0 + ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR %0, 0 ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} @@ -272,7 +271,7 @@ body: | ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0 + ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR %0, 0 ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} @@ -281,7 +280,8 @@ body: | ; GCN-NEXT: liveins: $sgpr10, $sgpr11, 
$sgpr30_sgpr31 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc ; GCN-NEXT: S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index d5f97314f9324..b0caa946d79f3 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -17,7 +17,7 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %16:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]] + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3604489 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]] ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll index 81dd2c4457b2f..b8e928021ca8d 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll @@ -13,7 +13,6 @@ define void @test() { ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; 
CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; CHECK-NEXT: .LBB0_1: ; %bb.1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_3 @@ -21,15 +20,13 @@ define void @test() { ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: .LBB0_3: ; %bb.3 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_readfirstlane_b32 s6, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_readfirstlane_b32 s6, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], -1 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: s_cmp_eq_u32 s6, s7 +; CHECK-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v0, s4, 0 ; CHECK-NEXT: v_writelane_b32 v0, s5, 1 ; CHECK-NEXT: s_mov_b64 s[10:11], exec @@ -64,10 +61,6 @@ define void @test() { ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.6: ; %bb.5 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir index 56882cd83106e..54fa09833b6d3 100644 --- a/llvm/test/CodeGen/AMDGPU/spill192.mir +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir @@ -34,14 +34,14 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def 
renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr9, 5, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr9, 5, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/spill224.mir b/llvm/test/CodeGen/AMDGPU/spill224.mir index 4525fc1cb70e6..36bb70089de1a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill224.mir +++ b/llvm/test/CodeGen/AMDGPU/spill224.mir @@ -32,15 +32,15 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, 
implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 6, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 6, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/spill288.mir b/llvm/test/CodeGen/AMDGPU/spill288.mir index 173056e6b9132..811067319069e 100644 --- a/llvm/test/CodeGen/AMDGPU/spill288.mir +++ 
b/llvm/test/CodeGen/AMDGPU/spill288.mir @@ -32,17 +32,17 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 8, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] + ; EXPANDED-NEXT: 
[[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr12, 8, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/spill320.mir b/llvm/test/CodeGen/AMDGPU/spill320.mir index 828f524bb0677..3ebdd60280636 100644 --- a/llvm/test/CodeGen/AMDGPU/spill320.mir +++ b/llvm/test/CodeGen/AMDGPU/spill320.mir @@ -32,18 +32,18 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, [[DEF]] - ; EXPANDED-NEXT: 
[[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr13, 9, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr13, 9, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/spill352.mir b/llvm/test/CodeGen/AMDGPU/spill352.mir index ef620fae5b104..275c725c0eae5 100644 --- a/llvm/test/CodeGen/AMDGPU/spill352.mir +++ b/llvm/test/CodeGen/AMDGPU/spill352.mir @@ -32,19 +32,19 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: 
[[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr14, 10, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = 
SI_SPILL_S32_TO_VGPR $sgpr10, 6, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr14, 10, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/spill384.mir b/llvm/test/CodeGen/AMDGPU/spill384.mir index 37cf5d0fc83cf..6df5bb372117c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill384.mir +++ b/llvm/test/CodeGen/AMDGPU/spill384.mir @@ -32,20 +32,20 @@ body: | ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, [[DEF]] - ; 
EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, [[DEF]] - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr15, 11, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, [[DEF]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, [[DEF]] + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:wwm_vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr15, 11, [[DEF]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: diff --git 
a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index b8bc01e0b879b..0ce19a74fa922 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -238,15 +238,10 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; WAVE32-O0-NEXT: v_mov_b32_e32 v1, v0 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 -; WAVE32-O0-NEXT: v_and_b32_e64 v1, 1, v1 -; WAVE32-O0-NEXT: v_cmp_eq_u32_e64 s5, v1, 1 +; WAVE32-O0-NEXT: v_and_b32_e64 v0, 1, v0 +; WAVE32-O0-NEXT: v_cmp_eq_u32_e64 s5, v0, 1 ; WAVE32-O0-NEXT: s_mov_b32 s4, exec_lo -; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) +; WAVE32-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: v_writelane_b32 v0, s4, 0 ; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill @@ -267,7 +262,6 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0 ; WAVE32-O0-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; WAVE32-O0-NEXT: ; kill: killed $vgpr0 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 @@ -280,15 +274,10 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-O0-NEXT: ; implicit-def: $vgpr1 : SGPR 
spill to VGPR lane -; WAVE64-O0-NEXT: v_mov_b32_e32 v1, v0 -; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] -; WAVE64-O0-NEXT: v_and_b32_e64 v1, 1, v1 -; WAVE64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, 1 +; WAVE64-O0-NEXT: v_and_b32_e64 v0, 1, v0 +; WAVE64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, 1 ; WAVE64-O0-NEXT: s_mov_b64 s[4:5], exec -; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) +; WAVE64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; WAVE64-O0-NEXT: v_writelane_b32 v0, s4, 0 ; WAVE64-O0-NEXT: v_writelane_b32 v0, s5, 1 ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 @@ -311,7 +300,6 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE64-O0-NEXT: v_readlane_b32 s4, v0, 0 ; WAVE64-O0-NEXT: v_readlane_b32 s5, v0, 1 ; WAVE64-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; WAVE64-O0-NEXT: ; kill: killed $vgpr0 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -324,10 +312,10 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_and_b32_e64 v0, 1, v0 ; WAVE32-WWM-PREALLOC-NEXT: v_cmp_eq_u32_e64 s5, v0, 1 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s4, exec_lo +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v1, s4, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_and_b32 s4, s4, s5 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 @@ -341,7 +329,6 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-WWM-PREALLOC-NEXT: .LBB4_2: ; %bb2 ; 
WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v1, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr1 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 @@ -923,7 +910,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: s_bitset0_b32 s27, 21 ; WAVE32-O0-NEXT: s_add_u32 s24, s24, s9 ; WAVE32-O0-NEXT: s_addc_u32 s25, s25, 0 -; WAVE32-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: s_mov_b32 s14, s8 ; WAVE32-O0-NEXT: s_mov_b32 s13, s7 ; WAVE32-O0-NEXT: s_mov_b32 s12, s6 @@ -931,6 +917,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: s_mov_b64 s[8:9], s[2:3] ; WAVE32-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; WAVE32-O0-NEXT: s_mov_b32 s0, s32 +; WAVE32-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1 @@ -1030,7 +1017,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: ; use s1 ; WAVE32-O0-NEXT: ;;#ASMEND ; WAVE32-O0-NEXT: s_mov_b32 s32, s0 -; WAVE32-O0-NEXT: ; kill: killed $vgpr0 ; WAVE32-O0-NEXT: s_endpgm ; ; WAVE64-O0-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: @@ -1042,7 +1028,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-O0-NEXT: s_add_u32 s24, s24, s9 ; WAVE64-O0-NEXT: s_addc_u32 s25, s25, 0 -; WAVE64-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE64-O0-NEXT: s_mov_b32 s14, s8 ; WAVE64-O0-NEXT: s_mov_b32 s13, s7 ; WAVE64-O0-NEXT: s_mov_b32 s12, s6 @@ -1050,6 +1035,7 @@ define amdgpu_kernel void 
@kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: s_mov_b64 s[8:9], s[2:3] ; WAVE64-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; WAVE64-O0-NEXT: s_mov_b32 s0, s32 +; WAVE64-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE64-O0-NEXT: v_writelane_b32 v3, s0, 0 ; WAVE64-O0-NEXT: s_lshr_b32 s0, s0, 6 ; WAVE64-O0-NEXT: v_writelane_b32 v3, s0, 1 @@ -1149,7 +1135,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: ; use s1 ; WAVE64-O0-NEXT: ;;#ASMEND ; WAVE64-O0-NEXT: s_mov_b32 s32, s0 -; WAVE64-O0-NEXT: ; kill: killed $vgpr0 ; WAVE64-O0-NEXT: s_endpgm ; ; WAVE32-WWM-PREALLOC-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: @@ -1162,7 +1147,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s27, 21 ; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s24, s24, s9 ; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s25, s25, 0 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s14, s8 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s13, s7 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s12, s6 @@ -1170,6 +1154,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[8:9], s[2:3] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[4:5], s[0:1] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s0, s32 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1 @@ -1262,7 +1247,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: ; use s1 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s0 -; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr32 ; WAVE32-WWM-PREALLOC-NEXT: s_endpgm 
%alloca = alloca [32 x i32], addrspace(5) %stacksave = call ptr addrspace(5) @llvm.stacksave.p5() @@ -1354,10 +1338,10 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s16 ; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0x1200 -; WAVE32-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: v_writelane_b32 v32, s30, 0 ; WAVE32-O0-NEXT: v_writelane_b32 v32, s31, 1 ; WAVE32-O0-NEXT: s_mov_b32 s16, s32 +; WAVE32-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s16, s16, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 1 @@ -1454,7 +1438,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: s_mov_b32 s32, s4 ; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s30, v32, 0 -; WAVE32-O0-NEXT: ; kill: killed $vgpr0 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload @@ -1474,10 +1457,10 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill ; WAVE64-O0-NEXT: s_mov_b64 exec, s[16:17] ; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0x2400 -; WAVE64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; WAVE64-O0-NEXT: v_writelane_b32 v32, s30, 0 ; WAVE64-O0-NEXT: v_writelane_b32 v32, s31, 1 ; WAVE64-O0-NEXT: s_mov_b32 s16, s32 +; WAVE64-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; WAVE64-O0-NEXT: v_writelane_b32 v0, s16, 0 ; WAVE64-O0-NEXT: s_lshr_b32 s16, s16, 6 ; WAVE64-O0-NEXT: v_writelane_b32 v0, s16, 1 @@ -1574,7 +1557,6 @@ define void 
@func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: s_mov_b32 s32, s4 ; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-O0-NEXT: v_readlane_b32 s30, v32, 0 -; WAVE64-O0-NEXT: ; kill: killed $vgpr0 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload @@ -1594,10 +1576,10 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s16 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0x1200 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s30, 0 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s31, 1 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, s32 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s16, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s16, s16, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s16, 1 @@ -1687,7 +1669,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0 -; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 20dc5ad5c8665..3904fa41869ff 100644 --- 
a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -13,22 +13,19 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_add_u32 s0, s0, s13 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane -; CHECK-NEXT: v_mov_b32_e32 v2, v0 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: global_load_ushort v3, v1, s[4:5] offset:4 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: ds_write_b8 v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: ds_write_b8 v0, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v0, s4, 0 ; CHECK-NEXT: v_writelane_b32 v0, s5, 1 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 @@ -65,10 +62,6 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: ; divergent unreachable ; CHECK-NEXT: .LBB0_4: ; %UnifiedReturnBlock -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[8:9] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm bb: %i10 = tail call i32 
@llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll index f680bbdd05cdd..84e364c658eb1 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll @@ -16,13 +16,12 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, -1 ; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, s[18:19] ; GFX90A-NEXT: v_writelane_b32 v40, s28, 2 ; GFX90A-NEXT: v_writelane_b32 v40, s29, 3 ; GFX90A-NEXT: v_writelane_b32 v40, s16, 4 -; GFX90A-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX90A-NEXT: v_writelane_b32 v40, s30, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX90A-NEXT: s_addk_i32 s32, 0x400 ; GFX90A-NEXT: v_writelane_b32 v40, s31, 1 ; GFX90A-NEXT: ;;#ASMSTART @@ -30,7 +29,7 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_writelane_b32 v0, s20, 0 ; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, s[28:29] ; GFX90A-NEXT: s_getpc_b64 s[16:17] ; GFX90A-NEXT: s_add_u32 s16, s16, foo@gotpcrel32@lo+4 @@ -39,15 +38,15 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a32 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[28:29] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readlane_b32 s20, v0, 0 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s20 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 -; GFX90A-NEXT: ; kill: killed $vgpr0 ; GFX90A-NEXT: v_readlane_b32 s28, v40, 2 ; GFX90A-NEXT: v_readlane_b32 s29, v40, 3 ; GFX90A-NEXT: v_readlane_b32 s4, v40, 4 @@ -55,7 +54,6 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, -1 ; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_addk_i32 s32, 0xfc00 ; GFX90A-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 3a33194f17c87..fcf3582be69e0 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -20,16 +20,15 @@ define void @test() #0 { ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s28, 2 ; GCN-NEXT: v_writelane_b32 v40, s29, 3 ; GCN-NEXT: v_writelane_b32 v40, s16, 4 -; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: s_addk_i32 s32, 0x800 +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s16 @@ -45,26 +44,24 @@ define void @test() #0 { ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-NEXT: v_readlane_b32 s29, v40, 3 ; GCN-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xf800 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -83,12 +80,12 @@ define void @test() #0 { ; GCN-O0-NEXT: v_writelane_b32 v40, s29, 3 ; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4 ; GCN-O0-NEXT: s_add_i32 s32, s32, 0x400 -; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-O0-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-O0-NEXT: ;;#ASMSTART ; GCN-O0-NEXT: ; def s16 ; GCN-O0-NEXT: ;;#ASMEND +; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_writelane_b32 v0, s16, 0 ; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -110,14 +107,13 @@ define void @test() #0 { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; 
GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s4 -; GCN-O0-NEXT: global_store_dword v[1:2], v3, off +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: global_store_dword v[0:1], v2, off ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3 ; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 11f6a2960776b..294069eba51a1 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -142,14 +142,11 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; 
GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 @@ -164,7 +161,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_mov_b32 s37, s44 ; GFX9-O0-NEXT: s_mov_b32 s38, s43 ; GFX9-O0-NEXT: s_mov_b32 s39, s42 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 ; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 ; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 2 @@ -221,25 +218,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s36, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s37, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s36, v4, 4 +; GFX9-O0-NEXT: v_readlane_b32 s37, v4, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[36:37] -; GFX9-O0-NEXT: v_readlane_b32 s38, v0, 0 -; GFX9-O0-NEXT: v_readlane_b32 s39, v0, 1 -; GFX9-O0-NEXT: v_readlane_b32 s34, v0, 2 -; GFX9-O0-NEXT: v_readlane_b32 s35, v0, 3 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s38, v4, 0 +; GFX9-O0-NEXT: v_readlane_b32 s39, v4, 1 +; GFX9-O0-NEXT: v_readlane_b32 s34, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s35, v4, 3 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; 
GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[36:37] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, v3 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[36:37] ; GFX9-O0-NEXT: s_mov_b32 s36, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s36, v3 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s36, v0 ; GFX9-O0-NEXT: s_mov_b32 s36, 2 -; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s36 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s36 ; GFX9-O0-NEXT: s_mov_b32 s40, s35 ; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: s_mov_b32 s34, s39 @@ -249,12 +246,12 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_mov_b32 s38, s35 ; GFX9-O0-NEXT: s_mov_b32 s39, s34 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -581,7 +578,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to 
VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 @@ -599,6 +595,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b32 s41, s45 ; GFX9-O0-NEXT: s_mov_b32 s42, s44 ; GFX9-O0-NEXT: s_mov_b32 s43, s35 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 ; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 ; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2 @@ -647,9 +644,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2 ; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -658,13 +652,12 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload diff --git 
a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index e5cebc1c31832..e00218a3b2275 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -121,12 +121,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s19, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s16, s16, s4 ; GFX9-O0-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0 ; GFX9-O0-NEXT: s_mov_b32 s4, s1 ; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 0 @@ -198,25 +194,25 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 6 +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 5 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 6 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: v_readlane_b32 s2, v0, 1 -; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 -; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 -; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
v_readlane_b32 s2, v4, 1 +; GFX9-O0-NEXT: v_readlane_b32 s3, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s0, v4, 3 +; GFX9-O0-NEXT: v_readlane_b32 s1, v4, 4 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v3 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s4, v3 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 2 -; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s4 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX9-O0-NEXT: s_mov_b32 s6, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 ; GFX9-O0-NEXT: s_mov_b32 s4, s3 @@ -226,8 +222,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s2, s5 ; GFX9-O0-NEXT: s_mov_b32 s3, s4 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: cfg: @@ -328,38 +323,32 @@ define hidden i32 @called(i32 %a) noinline { define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-LABEL: call: ; GFX9-O0: ; %bb.0: -; GFX9-O0-NEXT: s_mov_b32 s32, 0x400 +; GFX9-O0-NEXT: s_mov_b32 s32, 0 ; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: 
v_writelane_b32 v7, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v7, s11, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 1 ; GFX9-O0-NEXT: s_mov_b32 s14, s8 ; GFX9-O0-NEXT: s_mov_b32 s13, s7 ; GFX9-O0-NEXT: s_mov_b32 s12, s6 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v7, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v7, 1 -; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s5, 3 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2 -; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 2 +; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c @@ -373,23 +362,19 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: s_mov_b32 s17, s7 ; GFX9-O0-NEXT: s_mov_b32 s18, s6 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v3, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v3, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v3, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v3, s19, 7 ; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s3, 8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_writelane_b32 v3, s2, 9 +; GFX9-O0-NEXT: v_writelane_b32 v3, s3, 10 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -405,36 +390,29 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O0-NEXT: s_mov_b32 s6, 20 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s6, v3 -; GFX9-O0-NEXT: s_mov_b32 s6, 10 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s6, v4 -; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 +; GFX9-O0-NEXT: s_mov_b32 s6, 10 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v5, s6, v5 +; GFX9-O0-NEXT: v_or3_b32 v4, v6, v5, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 -; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v31, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; 
GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 7 +; GFX9-O0-NEXT: v_readlane_b32 s6, v3, 9 +; GFX9-O0-NEXT: v_readlane_b32 s7, v3, 10 +; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6 +; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v7 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: call: @@ -564,38 +542,32 @@ define i64 @called_i64(i64 %a) noinline { define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { ; GFX9-O0-LABEL: call_i64: ; GFX9-O0: ; %bb.0: -; GFX9-O0-NEXT: s_mov_b32 s32, 0x400 +; GFX9-O0-NEXT: s_mov_b32 s32, 0 ; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr12 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: 
s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: v_writelane_b32 v12, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v12, s11, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v8, s10, 0 +; GFX9-O0-NEXT: v_writelane_b32 v8, s11, 1 ; GFX9-O0-NEXT: s_mov_b32 s14, s8 ; GFX9-O0-NEXT: s_mov_b32 s13, s7 ; GFX9-O0-NEXT: s_mov_b32 s12, s6 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v12, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v12, 1 -; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 +; GFX9-O0-NEXT: v_writelane_b32 v8, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v8, s5, 3 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2 -; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3 +; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 2 +; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c @@ -609,24 +581,20 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_mov_b32 s17, s8 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; 
GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v8, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v8, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v8, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v8, s19, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s3 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s7 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_writelane_b32 v8, s2, 8 +; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -636,11 +604,11 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 ; GFX9-O0-NEXT: s_mov_b32 s9, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 ; GFX9-O0-NEXT: s_mov_b32 s0, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s0, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-O0-NEXT: s_getpc_b64 s[0:1] ; GFX9-O0-NEXT: s_add_u32 s0, s0, called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s1, s1, called_i64@gotpcrel32@hi+12 @@ -660,33 +628,25 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: ; implicit-def: 
$sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: v_add_co_u32_e64 v3, s[6:7], v3, v5 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: call_i64: @@ -993,12 +953,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s19, 0xe00000 ; 
GFX9-O0-NEXT: s_add_u32 s16, s16, s4 ; GFX9-O0-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 0 ; GFX9-O0-NEXT: s_mov_b32 s4, s1 ; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 0 @@ -1070,25 +1026,25 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB8_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 6 +; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 5 +; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 6 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: v_readlane_b32 s2, v0, 1 -; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 -; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 -; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 4 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s2, v4, 1 +; GFX9-O0-NEXT: v_readlane_b32 s3, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s0, v4, 3 +; GFX9-O0-NEXT: v_readlane_b32 s1, v4, 4 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v3 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s4, v3 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 2 -; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s4 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX9-O0-NEXT: s_mov_b32 s6, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 ; GFX9-O0-NEXT: s_mov_b32 s4, s3 @@ -1098,8 +1054,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s2, s5 ; GFX9-O0-NEXT: s_mov_b32 s3, s4 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: strict_wwm_cfg: @@ -1200,38 +1155,32 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline { define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-LABEL: strict_wwm_call: ; GFX9-O0: ; %bb.0: -; GFX9-O0-NEXT: s_mov_b32 s32, 0x400 +; GFX9-O0-NEXT: s_mov_b32 s32, 0 ; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: v_writelane_b32 v7, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v7, s11, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 1 ; GFX9-O0-NEXT: s_mov_b32 s14, s8 ; GFX9-O0-NEXT: s_mov_b32 s13, s7 ; 
GFX9-O0-NEXT: s_mov_b32 s12, s6 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v7, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v7, 1 -; GFX9-O0-NEXT: v_writelane_b32 v7, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v7, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s5, 3 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O0-NEXT: v_readlane_b32 s0, v7, 2 -; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 2 +; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 3 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c @@ -1245,23 +1194,19 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: s_mov_b32 s17, s7 ; GFX9-O0-NEXT: s_mov_b32 s18, s6 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v3, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v3, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v3, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v3, s19, 7 ; GFX9-O0-NEXT: s_mov_b32 s3, 
0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s3, 8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_writelane_b32 v3, s2, 9 +; GFX9-O0-NEXT: v_writelane_b32 v3, s3, 10 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1277,36 +1222,29 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O0-NEXT: s_mov_b32 s6, 20 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s6, v3 -; GFX9-O0-NEXT: s_mov_b32 s6, 10 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s6, v4 -; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 +; GFX9-O0-NEXT: s_mov_b32 s6, 10 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v5, s6, v5 +; GFX9-O0-NEXT: v_or3_b32 v4, v6, v5, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 -; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v31, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 -; GFX9-O0-NEXT: 
v_readlane_b32 s3, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 +; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 7 +; GFX9-O0-NEXT: v_readlane_b32 s6, v3, 9 +; GFX9-O0-NEXT: v_readlane_b32 s7, v3, 10 +; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v6 +; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v7 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: strict_wwm_call: @@ -1436,38 +1374,32 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline { define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: -; GFX9-O0-NEXT: s_mov_b32 s32, 0x400 +; GFX9-O0-NEXT: s_mov_b32 s32, 0 ; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 ; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr12 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: v_writelane_b32 v12, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v12, s11, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v8, s10, 0 +; GFX9-O0-NEXT: 
v_writelane_b32 v8, s11, 1 ; GFX9-O0-NEXT: s_mov_b32 s14, s8 ; GFX9-O0-NEXT: s_mov_b32 s13, s7 ; GFX9-O0-NEXT: s_mov_b32 s12, s6 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v12, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v12, 1 -; GFX9-O0-NEXT: v_writelane_b32 v12, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v12, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 +; GFX9-O0-NEXT: v_writelane_b32 v8, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v8, s5, 3 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O0-NEXT: v_readlane_b32 s0, v12, 2 -; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3 +; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 2 +; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c @@ -1481,24 +1413,20 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_mov_b32 s17, s8 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7 +; GFX9-O0-NEXT: v_writelane_b32 v8, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v8, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v8, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v8, s19, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s3 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s7 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_writelane_b32 v8, s2, 8 +; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1508,11 +1436,11 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 ; GFX9-O0-NEXT: s_mov_b32 s9, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 ; GFX9-O0-NEXT: s_mov_b32 s0, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s0, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-O0-NEXT: s_getpc_b64 s[0:1] ; GFX9-O0-NEXT: s_add_u32 s0, s0, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s1, s1, strict_wwm_called_i64@gotpcrel32@hi+12 @@ -1532,33 +1460,25 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 +; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: v_add_co_u32_e64 v3, s[6:7], v3, v5 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], s4 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: strict_wwm_call_i64: diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index 06a8a6fa04828..5e53a9d4822b9 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ 
b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -7,10 +7,10 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 ; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %1 -; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> +; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<61>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t26: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t29: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 -; CHECK-NEXT: t32: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t26, TargetConstant:i32<3>, t29, TargetConstant:i32<11> +; CHECK-NEXT: t32: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<61>, t26, TargetConstant:i32<3>, t29, TargetConstant:i32<11> ; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t49, t32 ; CHECK-NEXT: t23: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3> ; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t23 diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 7de7eabdd1f60..8db39e384dea1 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -17,12 +17,50 @@ public: bool hasFP(const MachineFunction &MF) const override { return false; } }; -static TargetRegisterClass *const BogusRegisterClasses[] = {nullptr}; +// Dummy regclass fields ... 
+namespace { +enum { + NoRegister, + GPR0 = 1, + NUM_TARGET_REGS +}; + +enum { + GPRsRegClassID = 0, +}; + +const MCPhysReg GPRs[] = {GPR0}; + +const uint8_t GPRsBits[] = { 0x00 }; + +static const uint32_t GPRsSubClassMask[] = { + 0x00000001, +}; +} // namespace + +static const TargetRegisterClass *const NullRegClasses[] = { nullptr }; +static const MCRegisterClass BogusMCRegisterClasses[] = {{ GPRs, GPRsBits, 0, 1, + sizeof(GPRsBits), GPRsRegClassID, 1, -1, false, true ,false }}; +static const TargetRegisterClass GPRsRegClass = { + &BogusMCRegisterClasses[GPRsRegClassID], + GPRsSubClassMask, + 0, + LaneBitmask(0x0000000000000001), + 0, + false, + 0x0, + false, + false, + NullRegClasses, + nullptr +}; + + static const TargetRegisterClass *const BogusRegisterClasses[] = {&GPRsRegClass}; class BogusRegisterInfo : public TargetRegisterInfo { public: BogusRegisterInfo() - : TargetRegisterInfo(nullptr, BogusRegisterClasses, BogusRegisterClasses, + : TargetRegisterInfo(nullptr, BogusRegisterClasses, BogusRegisterClasses+1, nullptr, nullptr, LaneBitmask(~0u), nullptr, nullptr) { InitMCRegisterInfo(nullptr, 0, 0, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr, nullptr, 0, nullptr, nullptr); diff --git a/llvm/unittests/CodeGen/MachineInstrTest.cpp b/llvm/unittests/CodeGen/MachineInstrTest.cpp index 49da0c38eefdc..3bac28324db13 100644 --- a/llvm/unittests/CodeGen/MachineInstrTest.cpp +++ b/llvm/unittests/CodeGen/MachineInstrTest.cpp @@ -530,8 +530,17 @@ TEST(MachineInstrTest, SpliceOperands) { EXPECT_EQ(MI->getOperand(8).getImm(), MachineOperand::CreateImm(4).getImm()); // test tied operands - MCRegisterClass MRC{ - 0, 0, 0, 0, 0, 0, 0, 0, /*Allocatable=*/true, /*BaseClass=*/true}; + MCRegisterClass MRC{0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + /*Allocatable=*/true, + /*BaseClass=*/true, + /*Synthetic=*/false}; TargetRegisterClass RC{&MRC, 0, 0, {}, 0, 0, 0, 0, 0, 0, 0}; // MachineRegisterInfo will be very upset if these registers aren't // allocatable. 
diff --git a/llvm/utils/TableGen/CodeGenRegisters.cpp b/llvm/utils/TableGen/CodeGenRegisters.cpp index 40af0d3077b2d..e72ada51d2bc0 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/CodeGenRegisters.cpp @@ -818,6 +818,8 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) BitInit *Bit = cast(TSF->getBit(I)); TSFlags |= uint8_t(Bit->getValue()) << I; } + + Synthetic = R->getValueAsBit("Synthetic"); } // Create an inferred register class that was missing from the .td files. @@ -828,7 +830,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, : Members(*Props.Members), TheDef(nullptr), Name(std::string(Name)), TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), RSI(Props.RSI), CopyCost(0), Allocatable(true), AllocationPriority(0), - GlobalPriority(false), TSFlags(0) { + GlobalPriority(false), TSFlags(0), Synthetic(false) { Artificial = true; GeneratePressureSet = false; for (const auto R : Members) { @@ -1096,7 +1098,8 @@ CodeGenRegisterClass::getMatchingSubClassWithSubRegs( for (auto *SuperRegRC : SuperRegRCs) { for (const auto &SuperRegClassPair : SuperRegClasses) { const BitVector &SuperRegClassBV = SuperRegClassPair.second; - if (SuperRegClassBV[SuperRegRC->EnumValue]) { + if (SuperRegClassBV[SuperRegRC->EnumValue] && + !SuperRegClassPair.first->Synthetic) { SubRegRC = SuperRegClassPair.first; ChosenSuperRegClass = SuperRegRC; diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h index c34f376ea99db..d53682417fc54 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.h +++ b/llvm/utils/TableGen/CodeGenRegisters.h @@ -333,6 +333,7 @@ class CodeGenRegisterClass { uint8_t AllocationPriority; bool GlobalPriority; uint8_t TSFlags; + bool Synthetic; /// Contains the combination of the lane masks of all subregisters. LaneBitmask LaneMask; /// True if there are at least 2 subregisters which do not interfere. 
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index d074e31c62458..041983de18248 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -1065,7 +1065,8 @@ void RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
        << ", " << RCBitsSize << ", " << RC.getQualifiedIdName() << ", "
        << RegSize << ", " << RC.CopyCost << ", "
        << (RC.Allocatable ? "true" : "false") << ", "
-       << (RC.getBaseClassOrder() ? "true" : "false") << " },\n";
+       << (RC.getBaseClassOrder() ? "true" : "false") << ", "
+       << (RC.Synthetic ? "true" : "false") << " },\n";
 }
 OS << "};\n\n";