diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 27b30bd5929ff..0cfb8454dcd99 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -42,6 +42,7 @@
 namespace llvm {
 
 class PSetIterator;
+class VirtRegMap;
 
 /// Convenient type to represent either a register class or a register bank.
 using RegClassOrRegBank =
@@ -107,6 +108,12 @@ class MachineRegisterInfo {
              VirtReg2IndexFunctor>
       RegAllocHints;
 
+  /// AntiHintRegs - This vector records register anti-hints for
+  /// virtual registers. For each virtual register, it keeps a vector of virtual
+  /// registers that should NOT be allocated to the same or overlapping physical
+  /// registers.
+  IndexedMap<SmallVector<Register, 4>, VirtReg2IndexFunctor> AntiHintRegs;
+
   /// PhysRegUseDefLists - This is an array of the head of the use/def list for
   /// physical registers.
   std::unique_ptr<MachineOperand *[]> PhysRegUseDefLists;
@@ -860,6 +867,61 @@ class MachineRegisterInfo {
     return RegAllocHints.inBounds(VReg) ? &RegAllocHints[VReg] : nullptr;
   }
 
+  /// addRegAllocAntiHint - Add a register allocation anti-hint for the
+  /// specified virtual register. This tells the allocator to avoid allocating
+  /// VReg to the same physical register as AntiHintVReg (or overlapping ones).
+  /// Only VReg's list is updated; callers wanting a symmetric constraint must
+  /// also add the reverse pair. Duplicates and self anti-hints are ignored.
+  void addRegAllocAntiHint(Register VReg, Register AntiHintVReg) {
+    assert(VReg.isVirtual() && "Anti-hints are only for virtual registers");
+    assert(AntiHintVReg.isVirtual() && "Anti-hint target must be virtual");
+    // A register always shares its own assignment, so an entry for itself
+    // could never be honoured.
+    if (VReg == AntiHintVReg)
+      return;
+    AntiHintRegs.grow(VReg);
+    auto &AntiHints = AntiHintRegs[VReg];
+    if (!is_contained(AntiHints, AntiHintVReg))
+      AntiHints.push_back(AntiHintVReg);
+  }
+
+  /// addRegAllocationAntiHints - Add multiple anti-hints for VReg at once.
+  void addRegAllocationAntiHints(Register VReg,
+                                 ArrayRef<Register> AntiHintVRegs) {
+    for (Register AntiHint : AntiHintVRegs)
+      addRegAllocAntiHint(VReg, AntiHint);
+  }
+
+  /// clearRegAllocationAntiHints - Clear all anti-hints for a register.
+  void clearRegAllocationAntiHints(Register VReg) {
+    assert(VReg.isVirtual() && "Anti-hints are only for virtual registers");
+    if (AntiHintRegs.inBounds(VReg))
+      AntiHintRegs[VReg].clear();
+  }
+
+  /// getRegAllocationAntiHints - Return the anti-hints recorded for VReg, or
+  /// an empty array if none were ever added.
+  ArrayRef<Register> getRegAllocationAntiHints(Register VReg) const {
+    assert(VReg.isVirtual() && "Anti-hints are only for virtual registers");
+    if (!AntiHintRegs.inBounds(VReg))
+      return {};
+    return AntiHintRegs[VReg];
+  }
+
+  /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint.
+  bool hasRegAllocationAntiHint(Register VReg, Register AntiHintVReg) const {
+    assert(VReg.isVirtual() && AntiHintVReg.isVirtual());
+    return AntiHintRegs.inBounds(VReg) &&
+           is_contained(AntiHintRegs[VReg], AntiHintVReg);
+  }
+
+  /// getPhysRegAntiHints - Get the set of physical registers to avoid.
+  /// VRM is the current virtual register map showing allocations made so far.
+  /// Results are appended to PhysAntiHints, which is then sorted and uniqued.
+  void getPhysRegAntiHints(Register VReg,
+                           SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+                           const VirtRegMap &VRM) const;
+
   /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
   /// specified register as undefined which causes the DBG_VALUE to be
   /// deleted during LiveDebugVariables analysis.
diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp
index 183dc8af1b91b..32005fd6ff837 100644
--- a/llvm/lib/CodeGen/AllocationOrder.cpp
+++ b/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -31,6 +31,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
                                         const LiveRegMatrix *Matrix) {
   const MachineFunction &MF = VRM.getMachineFunction();
   const TargetRegisterInfo *TRI = &VRM.getTargetRegInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
   auto Order = RegClassInfo.getOrder(MF.getRegInfo().getRegClass(VirtReg));
   SmallVector<MCPhysReg, 16> Hints;
   bool HardHints =
@@ -44,8 +45,76 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
       dbgs() << '\n';
     }
   });
+
+  // Translate VirtReg's anti-hints into the physical registers their targets
+  // are currently assigned to in VRM.
+  SmallVector<MCPhysReg, 16> AntiHintedPhysRegs;
+  MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, VRM);
+
+  LLVM_DEBUG({
+    if (!AntiHintedPhysRegs.empty()) {
+      dbgs() << "anti-hints:";
+      for (MCPhysReg AntiHint : AntiHintedPhysRegs)
+        dbgs() << ' ' << printReg(AntiHint, TRI);
+      dbgs() << '\n';
+    }
+  });
+
   assert(all_of(Hints,
                 [&](MCPhysReg Hint) { return is_contained(Order, Hint); }) &&
          "Target hint is outside allocation order.");
-  return AllocationOrder(std::move(Hints), Order, HardHints);
+
+  AllocationOrder AO(std::move(Hints), Order, HardHints);
+  if (!AntiHintedPhysRegs.empty()) {
+    AO.applyAntiHints(AntiHintedPhysRegs, TRI);
+
+    // Hints are never altered by anti-hints, so show what did change.
+    LLVM_DEBUG({
+      dbgs() << "anti-hinted order:";
+      for (MCPhysReg Reg : AO.Order)
+        dbgs() << ' ' << printReg(Reg, TRI);
+      dbgs() << '\n';
+    });
+  }
+  return AO;
+}
+
+/// Reorder the allocation order so that every register overlapping one of
+/// \p AntiHintedPhysRegs is placed after all registers that overlap none.
+/// Relative order within both groups is kept and no register is dropped.
+/// Hints is not touched.
+void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
+                                     const TargetRegisterInfo *TRI) {
+  auto IsAntiHinted = [&](MCPhysReg Reg) {
+    return llvm::any_of(AntiHintedPhysRegs, [&](MCPhysReg AntiHint) {
+      return TRI->regsOverlap(Reg, AntiHint);
+    });
+  };
+
+  // Build the new order in a temporary: Order already views
+  // FilteredOrderStorage if anti-hints were applied before, and a vector must
+  // not be re-filled from a range that lives inside itself.
+  decltype(FilteredOrderStorage) Reordered(Order.begin(), Order.end());
+  // The partition point is only read by the debug output below.
+  [[maybe_unused]] auto FirstAntiHinted = std::stable_partition(
+      Reordered.begin(), Reordered.end(),
+      [&](MCPhysReg Reg) { return !IsAntiHinted(Reg); });
+
+  LLVM_DEBUG({
+    size_t NonAntiHintedCount =
+        std::distance(Reordered.begin(), FirstAntiHinted);
+    size_t AntiHintedCount = std::distance(FirstAntiHinted, Reordered.end());
+    dbgs() << " Added " << NonAntiHintedCount
+           << " non-anti-hinted registers first\n"
+           << " Added " << AntiHintedCount
+           << " anti-hinted registers at the end\n"
+           << " Anti-hint filtering complete\n";
+  });
+
+  FilteredOrderStorage = std::move(Reordered);
+  // NOTE(review): Order now views this object's own storage. A copy or a
+  // non-elided move of the AllocationOrder would leave the new object's Order
+  // pointing into the source object; confirm no caller copies it, or add
+  // copy/move operations that re-point Order.
+  Order = FilteredOrderStorage;
 }
diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h
index 3dd02c3b14d3a..0b10272731444 100644
--- a/llvm/lib/CodeGen/AllocationOrder.h
+++ b/llvm/lib/CodeGen/AllocationOrder.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 
 namespace llvm {
 
@@ -29,6 +30,9 @@ class LiveRegMatrix;
 
 class LLVM_LIBRARY_VISIBILITY AllocationOrder {
   const SmallVector<MCPhysReg, 16> Hints;
+  // Owns the reordered register list once applyAntiHints has run; Order then
+  // refers to this vector instead of the caller-provided array.
+  SmallVector<MCPhysReg, 16> FilteredOrderStorage;
   ArrayRef<MCPhysReg> Order;
   // How far into the Order we can iterate. This is 0 if the AllocationOrder is
   // constructed with HardHints = true, Order.size() otherwise. While
@@ -117,6 +121,12 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder {
                static_cast<uint32_t>(std::numeric_limits<MCPhysReg>::max()));
     return Reg.isPhysical() && is_contained(Hints, Reg.id());
   }
+
+  /// Move every register of the order that overlaps one of
+  /// \p AntiHintedPhysRegs behind all registers that overlap none, keeping
+  /// the relative order within both groups. Hints is left unchanged.
+  void applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
+                      const TargetRegisterInfo *TRI);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index ae284f3ae2929..1cd74d3561b2b 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -11,15 +11,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Register.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DebugLoc.h"
@@ -674,3 +677,26 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
   }
   return false;
 }
+
+/// Append to \p PhysAntiHints the physical registers that \p VRM currently
+/// assigns to the virtual registers \p VReg is anti-hinted against, then sort
+/// and de-duplicate the whole vector. Anti-hinted registers without an
+/// assignment contribute nothing.
+void MachineRegisterInfo::getPhysRegAntiHints(
+    Register VReg, SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+    const VirtRegMap &VRM) const {
+  assert(VReg.isVirtual() && "Anti-hints are only for virtual registers");
+  if (!AntiHintRegs.inBounds(VReg))
+    return;
+
+  for (Register AntiHintVReg : AntiHintRegs[VReg]) {
+    if (!VRM.hasPhys(AntiHintVReg))
+      continue;
+    MCPhysReg PhysReg = VRM.getPhys(AntiHintVReg);
+    PhysAntiHints.push_back(PhysReg);
+  }
+
+  // Distinct virtual registers can map to the same physical register.
+  llvm::sort(PhysAntiHints);
+  PhysAntiHints.erase(llvm::unique(PhysAntiHints), PhysAntiHints.end());
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..dde4a84d45680 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -37,12 +37,19 @@
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Register.h"
 #include "llvm/InitializePasses.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
 
+static cl::opt<bool>
+    EnableAntiHintsForMFMARegs("amdgpu-anti-hints-for-mfma", cl::Hidden,
+                               cl::desc("Enable Anti-Hints for "
+                                        "MFMA in GCNPreRAOptimizations stage."),
+                               cl::init(true));
+
 namespace {
 
 class GCNPreRAOptimizationsImpl {
@@ -247,6 +254,99 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
   TRI = ST.getRegisterInfo();
 
   bool Changed = false;
+
+  // Add RA anti-hints to reduce MFMA hazard NOPs: registers used shortly after
+  // an MFMA are steered away from that MFMA's result and accumulator registers.
+  if (EnableAntiHintsForMFMARegs && ST.hasMAIInsts()) {
+    // Max lookback window for RAW or WAW hazard.
+    // NOTE(review): the window is counted in MFMA instructions, not in
+    // intervening instructions or wait states - confirm this is intended.
+    constexpr unsigned MaxLookbackWindow = 19;
+
+    // Append the named operand of MI to Regs when it is a virtual register
+    // whose class contains VGPRs.
+    auto CollectNamedOperand = [&](const MachineInstr &MI,
+                                   AMDGPU::OpName OpName, const char *OpNameStr,
+                                   SmallVectorImpl<Register> &Regs) {
+      const MachineOperand *MO = TII->getNamedOperand(MI, OpName);
+      if (!MO) {
+        LLVM_DEBUG(dbgs() << " Named operand " << OpNameStr
+                          << " not found\n");
+        return;
+      }
+      if (!MO->isReg() || !MO->getReg().isVirtual())
+        return;
+      const Register Reg = MO->getReg();
+      if (!TRI->hasVGPRs(MRI->getRegClass(Reg)))
+        return;
+      Regs.push_back(Reg);
+      LLVM_DEBUG(dbgs() << " Collected " << OpNameStr << " : "
+                        << printReg(Reg, TRI) << "\n");
+    };
+
+    for (const MachineBasicBlock &MBB : MF) {
+      // Registers of the most recent MFMAs in this block, oldest first.
+      SmallVector<SmallVector<Register, 2>, 16> RecentMFMAs;
+      for (const MachineInstr &MI : MBB) {
+        if (MI.isDebugInstr())
+          continue;
+
+        if (SIInstrInfo::isMFMA(MI)) {
+          // Remember the destination and the matrix C (accumulator) source.
+          SmallVector<Register, 2> MFMARegisters;
+          CollectNamedOperand(MI, AMDGPU::OpName::vdst, "vdst", MFMARegisters);
+          CollectNamedOperand(MI, AMDGPU::OpName::src2, "src2", MFMARegisters);
+          if (!MFMARegisters.empty()) {
+            RecentMFMAs.push_back(std::move(MFMARegisters));
+            // Maintain window
+            if (RecentMFMAs.size() > MaxLookbackWindow)
+              RecentMFMAs.erase(RecentMFMAs.begin());
+          }
+          continue;
+        }
+
+        // Skip non-relevant instructions, or skip until at least one MFMA is
+        // encountered.
+        const bool ShouldCheckReuse = MI.mayLoad() || MI.mayStore() ||
+                                      MI.isCopy() || SIInstrInfo::isVALU(MI);
+        if (!ShouldCheckReuse || RecentMFMAs.empty())
+          continue;
+
+        // The slot depends only on MI; look it up once rather than once per
+        // (operand, MFMA register) pair.
+        const SlotIndex CurrentSlot =
+            LIS->getInstructionIndex(MI).getRegSlot();
+
+        // Process operands that might reuse MFMA registers.
+        for (const MachineOperand &MO : MI.operands()) {
+          if (!MO.isReg() || !MO.getReg().isVirtual())
+            continue;
+
+          // Only process VGPR registers.
+          const Register CandidateReg = MO.getReg();
+          if (!TRI->isVGPRClass(MRI->getRegClass(CandidateReg)))
+            continue;
+
+          for (const auto &MFMARegs : llvm::reverse(RecentMFMAs)) {
+            for (Register MFMAReg : MFMARegs) {
+              // The last use of an MFMA register ends its live range at this
+              // slot, so the operand can be that very register; pairing a
+              // register with itself is meaningless.
+              if (MFMAReg == CandidateReg)
+                continue;
+              // Only MFMA registers that are dead at the current instruction
+              // get an anti-hint.
+              if (LIS->getInterval(MFMAReg).liveAt(CurrentSlot))
+                continue;
+              // Add bi-directional anti-hints.
+              MRI->addRegAllocAntiHint(CandidateReg, MFMAReg);
+              MRI->addRegAllocAntiHint(MFMAReg, CandidateReg);
+            }
+          }
+        }
+      }
+    }
+  }
 
   for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
     Register Reg = Register::index2VirtReg(I);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index b07dec326327e..3d9be93573ac9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -15,9 +15,12 @@
   ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT: ; implicit-def: $vgpr106
   ; GCN-NEXT: ; implicit-def: $vgpr132
+  ; GCN-NEXT: ; implicit-def: 
$vgpr112 + ; GCN-NEXT: ; implicit-def: $vgpr113 + ; GCN-NEXT: ; implicit-def: $vgpr114 + ; GCN-NEXT: ; implicit-def: $vgpr115 ; GCN-NEXT: ; implicit-def: $vgpr133 ; GCN-NEXT: ; implicit-def: $vgpr139 - ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -167,46 +170,45 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 - ; GCN-NEXT: ; implicit-def: $vgpr64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: ; implicit-def: $vgpr73 - ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 + ; GCN-NEXT: v_add_u32_e32 v73, v132, v112 ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v113 + ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr74 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 - ; GCN-NEXT: ; implicit-def: $vgpr75 + ; 
GCN-NEXT: v_add_u32_e32 v72, v132, v114 ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v115 ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ; kill: killed $vgpr73 ; GCN-NEXT: ds_read_b128 v[72:75], v94 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr76 ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ; implicit-def: $sgpr8 + ; GCN-NEXT: ; implicit-def: $vgpr112 + ; GCN-NEXT: ; implicit-def: $vgpr113 + ; GCN-NEXT: ; implicit-def: $vgpr114 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 @@ -411,8 +413,6 @@ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 ; GCN-NEXT: ; implicit-def: $vgpr65 ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr68 - ; GCN-NEXT: ; implicit-def: $vgpr67 ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 @@ -440,40 +440,36 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 + ; GCN-NEXT: ; implicit-def: $vgpr96 ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 ; GCN-NEXT: ; implicit-def: $vgpr65 ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v96 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - 
; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 - ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v112 + ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v113 ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v114 ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_exp_f32_e32 v163, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v164, v57 + ; GCN-NEXT: v_exp_f32_e32 v165, v57 ; GCN-NEXT: v_exp_f32_e32 v49, v48 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 @@ -499,31 +495,30 @@ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 ; GCN-NEXT: v_exp_f32_e32 v55, v48 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 ; 
GCN-NEXT: v_exp_f32_e32 v56, v48 ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 + ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 + ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_fma_f32 v156, s4, v59, -v134 + ; GCN-NEXT: v_pack_b32_f16 v59, v68, v58 + ; GCN-NEXT: v_pack_b32_f16 v58, v64, v67 + ; GCN-NEXT: v_mul_f32_e32 v80, 0x3fb8aa3b, v66 ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] @@ -532,288 +527,287 @@ ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 + ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GCN-NEXT: v_fma_f32 v157, s4, v60, -v134 + ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[58:59], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v141, v80 ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 + ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 - ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 - ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48 - ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 - ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 - ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 + ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 - ; GCN-NEXT: v_exp_f32_e32 v58, v58 - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48 - ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48 - ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48 - ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 - ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 - ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 + ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], 
v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 - ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 - ; GCN-NEXT: v_exp_f32_e32 v59, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] - ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48 - ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 - ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 - ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95] + ; GCN-NEXT: v_cvt_f16_f32_e32 v144, v54 + ; GCN-NEXT: v_cvt_f16_f32_e32 v145, v55 + ; GCN-NEXT: v_exp_f32_e32 v167, v57 + ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; GCN-NEXT: v_mul_f32_e32 v168, 0x3fb8aa3b, v157 + ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[58:59], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v148, v56 + ; GCN-NEXT: v_mul_f32_e64 v118, v118, v48 + ; GCN-NEXT: v_mul_f32_e64 v119, v119, v48 + ; GCN-NEXT: v_mul_f32_e64 v120, v120, v48 + ; GCN-NEXT: v_mul_f32_e64 v121, v121, v48 ; 
GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v149, v145, v148 + ; GCN-NEXT: v_pack_b32_f16 v148, v140, v144 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v156 + ; GCN-NEXT: v_exp_f32_e32 v168, v168 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v153, v140 + ; GCN-NEXT: ; implicit-def: $vgpr140 + ; GCN-NEXT: v_fma_f32 v164, s4, v61, -v134 + ; GCN-NEXT: v_fma_f32 v166, s4, v62, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v169, v141 ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 - ; GCN-NEXT: ; implicit-def: $vgpr57 - ; GCN-NEXT: ds_read_b128 v[60:63], v57 + ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 + ; GCN-NEXT: v_fma_f32 v57, s4, v35, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[148:149], v[64:79] + ; GCN-NEXT: ds_read_b128 v[142:145], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v160, v149 - ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 + ; GCN-NEXT: ds_read_b128 v[156:159], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 ; GCN-NEXT: v_fma_f32 v24, s4, v24, 
-v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 - ; GCN-NEXT: v_exp_f32_e32 v162, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[148:149], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v164 + ; GCN-NEXT: v_fma_f32 v164, s4, v33, -v134 + ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v166 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v165 + ; GCN-NEXT: v_exp_f32_e32 v170, v146 + ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v167 ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[148:149], v[96:111] ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v153 + ; GCN-NEXT: v_pack_b32_f16 v62, v169, v147 ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 + ; GCN-NEXT: v_perm_b32 v147, v131, v129, s8 + ; GCN-NEXT: v_pack_b32_f16 v63, v146, v33 ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 - ; GCN-NEXT: v_exp_f32_e32 v153, v33 - ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 - ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 - ; GCN-NEXT: v_exp_f32_e32 v165, v60 - ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 - ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v161, v61 - ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[148:149], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v148, v33 + ; GCN-NEXT: v_fma_f32 v152, s4, v36, -v134 + ; GCN-NEXT: v_perm_b32 v36, v162, v160, s5 + ; GCN-NEXT: v_cvt_f16_f32_e32 v149, v168 + ; GCN-NEXT: v_cvt_f16_f32_e32 v155, v170 + ; GCN-NEXT: v_perm_b32 v146, v163, v161, s8 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[62:63], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_read_b128 v[32:35], v140 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[58:61], v140 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v164 + ; GCN-NEXT: v_exp_f32_e32 v154, v142 + ; GCN-NEXT: v_perm_b32 v142, v162, v160, s8 + ; GCN-NEXT: v_fma_f32 v160, s4, v38, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[156:157], v[62:63], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v157, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v148 + ; GCN-NEXT: v_fma_f32 v156, s4, v37, -v134 ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 - ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: v_perm_b32 v143, v130, v128, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] - ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 + ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[62:63], v[96:111] ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 - ; GCN-NEXT: v_exp_f32_e32 v159, v33 + ; GCN-NEXT: v_perm_b32 v32, v163, v161, s5 + ; GCN-NEXT: v_exp_f32_e32 v161, v33 ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v152, v38 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[60:61] + ; GCN-NEXT: ds_write_b64 v136, v[142:143] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v137, v[32:33] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[58:59], v[62:63], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v59, v150, v38 + ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_pack_b32_f16 v58, v149, v155 + ; GCN-NEXT: v_exp_f32_e32 v149, v38 ; GCN-NEXT: ; implicit-def: $vgpr33 ; GCN-NEXT: ; implicit-def: $vgpr38 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[140:141] + ; GCN-NEXT: ds_write_b64 v138, v[146:147] ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; implicit-def: $vgpr36 ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[64:79], v[144:145], v[58:59], v[64:79] ; GCN-NEXT: ; implicit-def: $vgpr37 ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v152 + ; GCN-NEXT: v_exp_f32_e32 v150, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 - ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 - ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[158:159], v[58:59], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v156, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v160 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v154 + ; GCN-NEXT: v_cvt_f16_f32_e32 v152, v157 + ; GCN-NEXT: v_fma_f32 v57, s4, v39, -v134 ; GCN-NEXT: ds_read_b128 v[36:39], v139 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v154, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 - ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v155, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 - ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[96:111], v[34:35], v[128:129], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 - ; GCN-NEXT: v_exp_f32_e32 v157, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 - ; GCN-NEXT: v_exp_f32_e32 v146, v32 + ; GCN-NEXT: ds_read_b128 v[128:131], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[58:59], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v161 + ; GCN-NEXT: v_exp_f32_e32 v159, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v149 + ; GCN-NEXT: v_fma_f32 v155, s4, v41, -v134 + ; GCN-NEXT: v_fma_f32 v158, s4, v42, -v134 + ; GCN-NEXT: v_fma_f32 v162, s4, v20, -v134 + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[60:61], v[58:59], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v59, v34, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_pack_b32_f16 v58, v33, v152 + ; GCN-NEXT: v_exp_f32_e32 v60, v32 ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 - ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] + ; GCN-NEXT: v_fma_f32 v57, s4, v43, -v134 + ; GCN-NEXT: v_perm_b32 v20, v142, v62, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[58:59], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v147, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] 
- ; GCN-NEXT: v_exp_f32_e32 v143, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 + ; GCN-NEXT: v_exp_f32_e32 v61, v36 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v150 + ; GCN-NEXT: v_fma_f32 v155, s4, v46, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[128:129], v[58:59], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v152, v36 + ; GCN-NEXT: v_cvt_f16_f32_e32 v128, v156 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_fma_f32 v129, s4, v45, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[58:59], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v159 + ; GCN-NEXT: v_exp_f32_e32 v158, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v129, v36 + ; GCN-NEXT: v_pack_b32_f16 v32, v37, v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[58:59], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v57, v36 ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 - ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v61 + ; GCN-NEXT: v_fma_f32 v58, s4, v47, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v57 + ; GCN-NEXT: ds_read_b128 v[36:39], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v142, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 - ; 
GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 - ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 + ; GCN-NEXT: v_exp_f32_e32 v128, v40 + ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v129, v152 + ; GCN-NEXT: ds_read_b128 v[44:47], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v63, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 - ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 - ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[130:131], v[32:33], v[80:95] + ; GCN-NEXT: v_fma_f32 v130, s4, v17, -v134 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_exp_f32_e32 v131, v40 + ; GCN-NEXT: v_pack_b32_f16 v40, v59, v129 + ; GCN-NEXT: v_fma_f32 v155, s4, v18, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v128 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 - ; GCN-NEXT: v_exp_f32_e32 v158, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v158 + ; GCN-NEXT: v_exp_f32_e32 v160, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v57 ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v58 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v128, v17 - ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 - ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 + ; GCN-NEXT: v_fma_f32 v58, s4, v19, -v134 + ; GCN-NEXT: v_exp_f32_e32 v129, v17 + ; GCN-NEXT: v_perm_b32 v42, v143, v63, s8 + ; GCN-NEXT: v_perm_b32 v43, v147, v145, s8 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] ; GCN-NEXT: v_mul_f32_e32 
v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 + ; GCN-NEXT: ds_read_b128 v[16:19], v140 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 + ; GCN-NEXT: ds_read_b128 v[32:35], v140 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 - ; GCN-NEXT: v_exp_f32_e32 v167, v36 - ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 + ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v130 + ; GCN-NEXT: v_exp_f32_e32 v163, v36 + ; GCN-NEXT: v_perm_b32 v36, v142, v62, s8 ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 - ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v160 + ; GCN-NEXT: v_perm_b32 v21, v146, v144, s5 + ; GCN-NEXT: v_perm_b32 v37, v146, v144, s8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v131 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b64 v135, v[20:21] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 - ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v140, v17 - ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: v_perm_b32 v16, v143, v63, s5 + ; GCN-NEXT: v_fma_f32 v63, s4, v22, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v129 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_exp_f32_e32 v142, v17 + ; GCN-NEXT: v_perm_b32 v17, v147, v145, s5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v136, v[36:37] ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v58 ; GCN-NEXT: v_exp_f32_e32 v144, v22 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -836,22 +830,22 @@ ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 + ; GCN-NEXT: v_pack_b32_f16 v32, v59, v44 ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v162 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] ; GCN-NEXT: v_exp_f32_e32 v132, v16 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 - ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v163 + ; GCN-NEXT: v_fma_f32 v143, s4, v23, -v134 ; GCN-NEXT: ds_read_b128 v[20:23], v139 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -860,20 +854,20 @@ ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v63 ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130 ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 + ; GCN-NEXT: v_fma_f32 v63, s4, v26, -v134 + ; GCN-NEXT: v_fma_f32 v147, s4, v4, -v134 
; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v142 ; GCN-NEXT: v_exp_f32_e32 v145, v16 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v143 ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 ; GCN-NEXT: v_exp_f32_e32 v35, v16 ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 @@ -895,11 +889,11 @@ ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v63 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v141, v16 + ; GCN-NEXT: v_exp_f32_e32 v143, v16 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 + ; GCN-NEXT: v_fma_f32 v63, s4, v30, -v134 ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] @@ -907,25 +901,25 @@ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v57 + ; GCN-NEXT: ds_read_b128 v[20:23], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_exp_f32_e32 v36, v24 ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 + ; GCN-NEXT: ds_read_b128 v[28:31], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[80:95], v[38:39], v[16:17], v[80:95] ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v63 ; GCN-NEXT: v_exp_f32_e32 v39, v24 ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 + ; GCN-NEXT: v_fma_f32 v63, s4, v2, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 - ; GCN-NEXT: v_exp_f32_e32 v148, v1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v143 + ; GCN-NEXT: v_exp_f32_e32 v146, v1 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 @@ -933,25 +927,25 @@ ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 ; GCN-NEXT: v_exp_f32_e32 v34, v1 ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 + ; GCN-NEXT: v_perm_b32 v27, v59, v45, s8 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 + ; GCN-NEXT: ds_read_b128 v[16:19], v140 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v150, v20 + ; GCN-NEXT: v_exp_f32_e32 v155, v20 ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 + ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v146 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v38, v21 ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 + ; GCN-NEXT: v_perm_b32 v5, v58, v44, s5 + ; GCN-NEXT: 
v_perm_b32 v21, v58, v44, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND @@ -961,9 +955,9 @@ ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v63 ; GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: v_perm_b32 v1, v59, v45, s5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v136, v[20:21] @@ -987,10 +981,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v147 ; GCN-NEXT: v_exp_f32_e32 v26, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v155 ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1042,10 +1036,10 @@ ; GCN-NEXT: v_exp_f32_e32 v21, v9 ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v57 + ; GCN-NEXT: ds_read_b128 v[4:7], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 + ; GCN-NEXT: ds_read_b128 v[12:15], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 @@ -1071,33 +1065,33 @@ ; GCN-NEXT: v_add_f32_e32 v3, v54, v3 ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 
v3, v162, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v168, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v170, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v149, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v60, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v61, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v57, v3 ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v131, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 @@ -1105,14 +1099,14 @@ ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 + ; GCN-NEXT: v_add_f32_e32 
v3, v143, v3 ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 @@ -1137,7 +1131,7 @@ ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 @@ -1147,7 +1141,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0887fdf0844b0..be97a1e82fcf2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -10,25 +10,24 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s20, v2 ; GCN-NEXT: ; implicit-def: $sgpr4 - ; GCN-NEXT: ; implicit-def: $vgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr64 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr50 + ; GCN-NEXT: ; implicit-def: $vgpr76 ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: ; implicit-def: $vgpr49 ; GCN-NEXT: ; 
implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 - ; GCN-NEXT: ; implicit-def: $vgpr51 - ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr50 ; GCN-NEXT: ; implicit-def: $vgpr77 ; GCN-NEXT: ; implicit-def: $vgpr78 ; GCN-NEXT: ; implicit-def: $vgpr79 ; GCN-NEXT: ; implicit-def: $vgpr80 - ; GCN-NEXT: ; implicit-def: $vgpr91 + ; GCN-NEXT: ; implicit-def: $vgpr81 + ; GCN-NEXT: ; implicit-def: $vgpr103 ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 + ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v64 ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -36,8 +35,9 @@ ; GCN-NEXT: s_lshl_b32 s4, s20, 7 ; GCN-NEXT: ; implicit-def: $vgpr5 ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1 - ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 - ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76 + ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 + ; GCN-NEXT: v_add_u32_e32 v77, s20, v77 + ; GCN-NEXT: v_and_b32_e32 v77, 0x1fffffff, v77 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b128 v48, v[0:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -48,8 +48,8 @@ ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_add_u32_e32 v0, v0, v50 - ; GCN-NEXT: v_add_u32_e32 v1, v1, v50 + ; GCN-NEXT: v_add_u32_e32 v0, v0, v76 + ; GCN-NEXT: v_add_u32_e32 v1, v1, v76 ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -68,22 +68,22 @@ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 - ; GCN-NEXT: 
v_mul_lo_u32 v76, v76, s6 - ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 1 - ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76 - ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: v_mul_lo_u32 v77, v77, s6 + ; GCN-NEXT: v_add_lshl_u32 v77, v78, v77, 1 ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 ; GCN-NEXT: ; implicit-def: $sgpr2 ; GCN-NEXT: ; implicit-def: $sgpr3 - ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 + ; GCN-NEXT: v_lshl_add_u32 v80, v81, 1, v79 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ds_read_b128 v[36:39], v51 + ; GCN-NEXT: ds_read_b128 v[36:39], v50 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512 + ; GCN-NEXT: ds_read_b128 v[44:47], v50 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 @@ -107,20 +107,20 @@ ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v51 + ; GCN-NEXT: ds_read_b128 v[68:71], v50 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] ; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: v_add_u32_e32 v82, v32, v50 - ; GCN-NEXT: v_add_u32_e32 v83, v33, v50 - ; GCN-NEXT: ; kill: killed $vgpr82 + ; GCN-NEXT: v_add_u32_e32 v83, v32, v76 + ; GCN-NEXT: v_add_u32_e32 v76, v33, v76 ; GCN-NEXT: ; kill: killed $vgpr83 + ; GCN-NEXT: ; kill: killed $vgpr76 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], 
v[0:15] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31] - ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512 + ; GCN-NEXT: ds_read_b128 v[66:69], v50 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART @@ -131,20 +131,20 @@ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15] ; GCN-NEXT: ; implicit-def: $vgpr66 ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_max_f32_e32 v81, v67, v67 + ; GCN-NEXT: v_max_f32_e32 v82, v67, v67 ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31] ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2 ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v70 + ; GCN-NEXT: ds_write_b32 v77, v70 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v71 + ; GCN-NEXT: ds_write_b32 v78, v71 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v72 + ; GCN-NEXT: ds_write_b32 v79, v72 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16 @@ -152,11 +152,11 @@ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v21 ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v22 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v81 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25 ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 @@ -166,12 +166,12 @@ ; GCN-NEXT: v_mul_f32_e32 v69, s4, 
v28 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29 ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v30 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31 ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84 + ; GCN-NEXT: v_max3_f32 v64, v64, v81, v84 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2 ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3 ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86 @@ -179,315 +179,315 @@ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5 ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v7 ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v81 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10 ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11 ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12 ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13 ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 + ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v68 - ; GCN-NEXT: ; implicit-def: $vgpr84 - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v70, v64, v65 + ; GCN-NEXT: ds_write_b32 v80, v68 + ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, 
s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70 + ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; implicit-def: $vgpr87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] - ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 - ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 - ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 - ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 + ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 + ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 + ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[0:1] + ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 + ; GCN-NEXT: v_max_f32_e32 v65, v82, v64 + ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v65 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v65 + ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v65 + ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v65 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 - ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 - ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 - ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 - ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 - ; GCN-NEXT: v_exp_f32_e32 v73, v16 - ; GCN-NEXT: v_exp_f32_e32 v74, v18 - ; GCN-NEXT: v_exp_f32_e32 v75, v19 + ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v65 + ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v65 + ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v65 + ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v65 + ; GCN-NEXT: v_exp_f32_e32 v72, 
v16 + ; GCN-NEXT: v_exp_f32_e32 v73, v17 + ; GCN-NEXT: v_exp_f32_e32 v81, v18 + ; GCN-NEXT: v_exp_f32_e32 v82, v19 ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_exp_f32_e32 v80, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 - ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 - ; GCN-NEXT: v_exp_f32_e32 v81, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 - ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 - ; GCN-NEXT: v_exp_f32_e32 v82, v22 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 - ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 - ; GCN-NEXT: v_exp_f32_e32 v83, v23 - ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v72 + ; GCN-NEXT: v_fma_f32 v17, s4, v24, -v65 + ; GCN-NEXT: v_exp_f32_e32 v83, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v73 + ; GCN-NEXT: v_fma_f32 v19, s4, v25, -v65 + ; GCN-NEXT: v_exp_f32_e32 v84, v21 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v81 + ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v65 ; GCN-NEXT: v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_exp_f32_e32 v17, v17 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 - ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 - ; GCN-NEXT: v_exp_f32_e32 v88, v23 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 - ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v82 + ; GCN-NEXT: v_pack_b32_f16 v24, v16, v18 + ; GCN-NEXT: v_sub_f32_e32 v22, v67, v65 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 + ; GCN-NEXT: v_pack_b32_f16 v25, v20, v21 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v19 + ; 
GCN-NEXT: ds_read_b128 v[16:19], v87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v16, v24 - ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 + ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v65 + ; GCN-NEXT: v_exp_f32_e32 v86, v23 + ; GCN-NEXT: v_exp_f32_e32 v64, v22 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v16, 0, v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v83 + ; GCN-NEXT: v_fma_f32 v88, s4, v28, -v65 + ; GCN-NEXT: v_exp_f32_e32 v89, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v90, v84 + ; GCN-NEXT: v_fma_f32 v91, s4, v29, -v65 + ; GCN-NEXT: v_exp_f32_e32 v92, v21 + ; GCN-NEXT: ds_read_b128 v[20:23], v87 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[58:59], 
v[58:59], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 - ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 - ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 - ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 - ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 - ; GCN-NEXT: v_exp_f32_e32 v30, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 - ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 - ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 - ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 - ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 - ; GCN-NEXT: ds_read_b128 v[26:29], v91 + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: 
v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_perm_b32 v99, v70, v68, s2 + ; GCN-NEXT: v_perm_b32 v100, v70, v68, s3 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v93, v73, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v94, v85 + ; GCN-NEXT: v_fma_f32 v95, s4, v30, -v65 + ; GCN-NEXT: v_exp_f32_e32 v96, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v97, v86 + ; GCN-NEXT: v_fma_f32 v98, s4, v31, -v65 + ; GCN-NEXT: v_perm_b32 v101, v71, v69, s2 + ; GCN-NEXT: v_perm_b32 v102, v71, v69, s3 + ; GCN-NEXT: ds_read_b128 v[68:71], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576 + ; GCN-NEXT: ds_read_b128 v[72:75], v103 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v31 - ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_exp_f32_e32 v31, v31 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 - ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 + ; GCN-NEXT: ds_write_b32 v77, v99 + ; GCN-NEXT: v_exp_f32_e32 v67, v67 + ; GCN-NEXT: v_pack_b32_f16 v76, v76, v90 + ; GCN-NEXT: v_pack_b32_f16 v77, v94, v97 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v64 + ; GCN-NEXT: ds_write_b32 v78, v100 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: ds_write_b32 v79, v101 + ; GCN-NEXT: v_mul_f32_e32 v78, 0x3fb8aa3b, v88 + ; GCN-NEXT: v_mul_f32_e32 v79, 0x3fb8aa3b, v91 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v81, v81, v93 + ; GCN-NEXT: v_cvt_f16_f32_e32 v90, v89 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v65 + ; GCN-NEXT: 
v_exp_f32_e32 v91, v78 + ; GCN-NEXT: v_cvt_f16_f32_e32 v78, v92 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v65 + ; GCN-NEXT: v_exp_f32_e32 v93, v79 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v65 - ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 - ; GCN-NEXT: v_exp_f32_e32 v22, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 - ; GCN-NEXT: v_exp_f32_e32 v64, v65 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 - ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 - ; GCN-NEXT: v_exp_f32_e32 v23, v23 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 + ; GCN-NEXT: ds_write_b32 v80, v102 + ; GCN-NEXT: v_mul_f32_e32 v80, 0x3fb8aa3b, v95 + ; GCN-NEXT: v_add_f32_e32 v76, v82, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v96 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v65 + ; GCN-NEXT: v_exp_f32_e32 v80, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v79, v67 + ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v98 + ; GCN-NEXT: v_fma_f32 v81, s4, v3, -v65 + ; GCN-NEXT: v_exp_f32_e32 v82, v88 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 - ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19 - ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 - ; GCN-NEXT: v_exp_f32_e32 v25, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 - ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 - ; GCN-NEXT: v_exp_f32_e32 v27, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 
v4, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 - ; GCN-NEXT: v_exp_f32_e32 v65, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_pack_b32_f16 v0, v90, v78 + ; GCN-NEXT: v_pack_b32_f16 v1, v77, v79 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 - ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 - ; GCN-NEXT: v_exp_f32_e32 v68, v2 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v68, v83, v76 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v91 + ; GCN-NEXT: v_fma_f32 v83, s4, v4, -v65 + ; GCN-NEXT: v_exp_f32_e32 v90, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v93 + ; GCN-NEXT: v_fma_f32 v94, s4, v5, -v65 + ; GCN-NEXT: v_exp_f32_e32 v88, v88 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v68, v84, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v80 + ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v65 + ; GCN-NEXT: v_exp_f32_e32 v72, v2 + ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v82 + ; GCN-NEXT: v_pack_b32_f16 v4, v69, v4 + ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v81 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v84 + ; GCN-NEXT: ds_read_b128 v[0:3], v87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19 - ; GCN-NEXT: v_exp_f32_e32 v24, v24 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v73 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v65 + ; GCN-NEXT: v_exp_f32_e32 v73, v69 + ; GCN-NEXT: ds_read_b128 v[76:79], v87 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 - ; GCN-NEXT: v_exp_f32_e32 v26, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 - ; GCN-NEXT: v_exp_f32_e32 v67, v67 + ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v83 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v68, v85, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v90 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v65 + ; GCN-NEXT: v_exp_f32_e32 v71, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v88 + ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v65 + ; GCN-NEXT: v_exp_f32_e32 v81, v81 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47] ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 - ; GCN-NEXT: v_exp_f32_e32 v6, v6 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 + ; GCN-NEXT: v_add_f32_e32 v68, v86, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v72 + ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v65 + ; GCN-NEXT: v_exp_f32_e32 v74, v6 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v73 ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 - ; GCN-NEXT: v_exp_f32_e32 v7, v7 - ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_fma_f32 v75, s4, v11, -v65 + ; GCN-NEXT: v_exp_f32_e32 v83, v7 + ; GCN-NEXT: v_pack_b32_f16 v4, v70, v69 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v6 + ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v9 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] - ; 
GCN-NEXT: v_add_f32_e32 v0, v85, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v89, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v71 + ; GCN-NEXT: v_fma_f32 v70, s4, v12, -v65 + ; GCN-NEXT: v_exp_f32_e32 v84, v7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v81 + ; GCN-NEXT: v_fma_f32 v86, s4, v13, -v65 + ; GCN-NEXT: v_exp_f32_e32 v87, v8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v76, v92, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 - ; GCN-NEXT: v_exp_f32_e32 v10, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 - ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 - ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 - ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 - ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 - ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 - ; GCN-NEXT: v_exp_f32_e32 v8, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 - ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 - ; GCN-NEXT: v_exp_f32_e32 v9, v9 - ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 - ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 - ; GCN-NEXT: 
v_mul_f32_e32 v11, 0x3fb8aa3b, v11 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 - ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 - ; GCN-NEXT: v_exp_f32_e32 v11, v11 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 - ; GCN-NEXT: v_exp_f32_e32 v12, v3 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 - ; GCN-NEXT: v_exp_f32_e32 v17, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 - ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 - ; GCN-NEXT: v_exp_f32_e32 v15, v3 - ; GCN-NEXT: v_exp_f32_e32 v18, v1 - ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v91 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v74 + ; GCN-NEXT: v_fma_f32 v77, s4, v14, -v65 + ; GCN-NEXT: v_exp_f32_e32 v89, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v92, v83 + ; GCN-NEXT: v_pack_b32_f16 v68, v68, v85 + ; GCN-NEXT: v_mul_f32_e32 v75, 0x3fb8aa3b, v75 + ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v70 + ; GCN-NEXT: v_pack_b32_f16 v69, v69, v92 + ; GCN-NEXT: v_fma_f32 v65, s4, v15, -v65 + ; GCN-NEXT: v_exp_f32_e32 v75, v75 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v76, v96, v76 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v84 + ; GCN-NEXT: v_exp_f32_e32 v92, v70 + ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v86 + ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v87 + ; GCN-NEXT: v_exp_f32_e32 v94, v70 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v67, v67, v76 + ; GCN-NEXT: v_add_f32_e32 v67, v91, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v93, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v80, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v82, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v90, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v88, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v72, v67 + ; GCN-NEXT: v_mul_f32_e32 v68, 0x3fb8aa3b, v77 + ; GCN-NEXT: v_add_f32_e32 v67, v73, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v89 + ; GCN-NEXT: 
v_exp_f32_e32 v78, v68 + ; GCN-NEXT: v_add_f32_e32 v67, v71, v67 + ; GCN-NEXT: ds_read_b128 v[68:71], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 - ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 - ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 - ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 - ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 - ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12 - ; GCN-NEXT: v_add_f32_e32 v6, v11, v6 - ; GCN-NEXT: v_add_f32_e32 v6, v12, v6 - ; GCN-NEXT: v_add_f32_e32 v1, v15, v6 - ; GCN-NEXT: v_add_f32_e32 v11, v17, v1 - ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v0, v4, v10 - ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576 + ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v75 + ; GCN-NEXT: v_exp_f32_e32 v65, v65 + ; GCN-NEXT: v_add_f32_e32 v67, v81, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v74, v67 + ; GCN-NEXT: v_pack_b32_f16 v77, v76, v77 + ; GCN-NEXT: v_pack_b32_f16 v76, v85, v86 + ; GCN-NEXT: v_add_f32_e32 v67, v83, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v72, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v78 + ; GCN-NEXT: v_cvt_f16_f32_e32 v74, v92 + ; GCN-NEXT: v_add_f32_e32 v67, v84, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v87, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v89, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v75, v67 + ; GCN-NEXT: v_pack_b32_f16 v69, v68, v72 + ; GCN-NEXT: v_pack_b32_f16 v68, v74, v73 + ; GCN-NEXT: ds_read_b128 v[72:75], v103 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v67, v92, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v94, v67 + ; 
GCN-NEXT: v_add_f32_e32 v67, v78, v67 + ; GCN-NEXT: v_add_f32_e32 v65, v65, v67 + ; GCN-NEXT: ds_bpermute_b32 v67, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v65, v65, v67 + ; GCN-NEXT: ds_bpermute_b32 v66, v66, v65 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mov_b32_e32 v4, 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: v_mov_b32_e32 v67, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] - ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 + ; GCN-NEXT: v_cndmask_b32_e64 v65, v66, v65, s[0:1] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63] + ; GCN-NEXT: v_fmac_f32_e32 v65, v67, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir new file mode 100644 index 0000000000000..ba89b09539113 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir @@ -0,0 +1,239 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck %s --check-prefix=CHECK +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter 
-amdgpu-anti-hints-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT + +--- | + target triple = "amdgcn-amd-amdhsa" + + define amdgpu_kernel void @test_software_pipelining() #0 { + bb.0: + ret void + } + + attributes #0 = {nounwind "amdgpu-waves-per-eu"="2" "amdgpu-agpr-alloc"="0" "frame-pointer"="none"} + +... +--- +name: test_software_pipelining +body: | + bb.0: + ; CHECK-LABEL: name: test_software_pipelining + ; CHECK: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr72 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr68 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr73 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF + ; CHECK-NEXT: 
renamable $vgpr74 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr56 = V_ADD_U32_e32 4096, $vgpr74, implicit $exec + ; CHECK-NEXT: renamable $vgpr75 = V_ADD_U32_e32 $vgpr68, killed $vgpr52, implicit $exec + ; CHECK-NEXT: renamable $vgpr52 = V_ADD_U32_e32 4096, $vgpr75, implicit $exec + ; CHECK-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr52, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr56, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr56_vgpr57, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr73, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr46_vgpr47, $vgpr58_vgpr59, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr52_vgpr53, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr46_vgpr47, $vgpr54_vgpr55, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr69 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr68 = V_ADD_U32_e32 killed $vgpr68, killed $vgpr69, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr68_vgpr69_vgpr70_vgpr71 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr74, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: 
dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr68_vgpr69_vgpr70_vgpr71 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr75, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr73, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF + ; CHECK-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr72, killed renamable $vgpr68_vgpr69_vgpr70_vgpr71, 
16384, 0, implicit $exec :: (store (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF + ; CHECK-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr72, killed renamable $vgpr48_vgpr49_vgpr50_vgpr51, 20480, 0, implicit $exec :: (store (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, killed $vgpr54_vgpr55, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable 
$vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 killed renamable $vgpr73, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr64_vgpr65, killed $vgpr56_vgpr57, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr74, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; + ; CHECK-NO-ANTIHINT-LABEL: name: test_software_pipelining + ; CHECK-NO-ANTIHINT: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr68 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr69 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr70 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = 
IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr71 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr56 = V_ADD_U32_e32 4096, $vgpr71, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr72 = V_ADD_U32_e32 $vgpr69, killed $vgpr52, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52 = V_ADD_U32_e32 4096, $vgpr72, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr52, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr56, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr70, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, 
implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr56_vgpr57, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr70, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr46_vgpr47, $vgpr58_vgpr59, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr52_vgpr53, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr46_vgpr47, $vgpr54_vgpr55, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36 = 
V_ADD_U32_e32 killed $vgpr69, killed $vgpr36, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr71, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr70, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr56_vgpr57, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr72, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr58_vgpr59, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, 
$vgpr52_vgpr53, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr54_vgpr55, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr70, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr56_vgpr57, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr68, killed renamable $vgpr24_vgpr25_vgpr26_vgpr27, 16384, 0, implicit $exec :: (store (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr58_vgpr59, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr52_vgpr53, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr54_vgpr55, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = DS_READ_B128_gfx9 renamable $vgpr70, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr56_vgpr57, killed 
$vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr14_vgpr15_vgpr16_vgpr17 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr68, killed renamable $vgpr14_vgpr15_vgpr16_vgpr17, 20480, 0, implicit $exec :: (store (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr58_vgpr59, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr52_vgpr53, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr54_vgpr55, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 killed renamable $vgpr70, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr12_vgpr13, killed $vgpr56_vgpr57, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr71, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = IMPLICIT_DEF + %5:sgpr_128 = IMPLICIT_DEF + %6:sgpr_128 = IMPLICIT_DEF + %7:vgpr_32 = IMPLICIT_DEF + %8:vreg_128_align2 = IMPLICIT_DEF + %9:vreg_128_align2 = IMPLICIT_DEF + %10:vreg_128_align2 = IMPLICIT_DEF 
+ %11:vreg_128_align2 = IMPLICIT_DEF + %12:vreg_128_align2 = IMPLICIT_DEF + %13:vreg_128_align2 = IMPLICIT_DEF + %14:vreg_128_align2 = IMPLICIT_DEF + %15:vreg_128_align2 = IMPLICIT_DEF + %16:vreg_128_align2 = IMPLICIT_DEF + %17:vreg_128_align2 = IMPLICIT_DEF + %18:vreg_128_align2 = IMPLICIT_DEF + %19:vreg_128_align2 = IMPLICIT_DEF + %20:vreg_128_align2 = IMPLICIT_DEF + %21:vreg_128_align2 = IMPLICIT_DEF + %22:vreg_128_align2 = IMPLICIT_DEF + %23:vreg_128_align2 = IMPLICIT_DEF + %24:vgpr_32 = IMPLICIT_DEF + %25:vgpr_32 = V_ADD_U32_e32 4096, %24, implicit $exec + %26:vgpr_32 = V_ADD_U32_e32 %3, %7, implicit $exec + %27:vgpr_32 = V_ADD_U32_e32 4096, %26, implicit $exec + %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %30:vreg_128_align2 = IMPLICIT_DEF + %31:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub0_sub1, %29.sub0_sub1, %23, 0, 0, 0, implicit $mode, implicit $exec + %32:vreg_128_align2 = DS_READ_B128_gfx9 %4, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub2_sub3, %29.sub2_sub3, %31, 0, 0, 0, implicit $mode, implicit $exec + %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub0_sub1, %28.sub0_sub1, %22, 0, 0, 0, implicit $mode, implicit $exec + %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub2_sub3, %28.sub2_sub3, %34, 0, 0, 0, implicit $mode, implicit $exec + %36:vreg_128_align2 = IMPLICIT_DEF + %37:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub0_sub1, %29.sub0_sub1, %21, 0, 0, 0, implicit $mode, implicit $exec + %38:vreg_128_align2 = DS_READ_B128_gfx9 %4, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + %39:vreg_128_align2 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub2_sub3, %29.sub2_sub3, %37, 0, 0, 0, implicit $mode, implicit $exec + %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub0_sub1, %28.sub0_sub1, %20, 0, 0, 0, implicit $mode, implicit $exec + %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub2_sub3, %28.sub2_sub3, %40, 0, 0, 0, implicit $mode, implicit $exec + %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1, %29.sub0_sub1, %19, 0, 0, 0, implicit $mode, implicit $exec + %43:vgpr_32 = IMPLICIT_DEF + %44:vgpr_32 = V_ADD_U32_e32 %3, %43, implicit $exec + %45:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24, %6, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3, %29.sub2_sub3, %42, 0, 0, 0, implicit $mode, implicit $exec + %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1, %28.sub0_sub1, %18, 0, 0, 0, implicit $mode, implicit $exec + %48:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3, %28.sub2_sub3, %47, 0, 0, 0, implicit $mode, implicit $exec + %49:vreg_128_align2 = DS_READ_B128_gfx9 %4, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + %50:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1, %29.sub0_sub1, %17, 0, 0, 0, implicit $mode, implicit $exec + %51:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26, %6, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3, %29.sub2_sub3, %50, 0, 0, 0, implicit $mode, implicit $exec + %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1, %28.sub0_sub1, %16, 0, 0, 0, implicit $mode, implicit $exec + %54:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3, 
%28.sub2_sub3, %53, 0, 0, 0, implicit $mode, implicit $exec + %55:vreg_128_align2 = DS_READ_B128_gfx9 %4, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + %56:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub0_sub1, %29.sub0_sub1, %15, 0, 0, 0, implicit $mode, implicit $exec + %57:vreg_128_align2 = IMPLICIT_DEF + DS_WRITE_B128_gfx9 %1, %57, 16384, 0, implicit $exec :: (store (s128), addrspace 3) + %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub2_sub3, %29.sub2_sub3, %56, 0, 0, 0, implicit $mode, implicit $exec + %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub0_sub1, %28.sub0_sub1, %14, 0, 0, 0, implicit $mode, implicit $exec + %60:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub2_sub3, %28.sub2_sub3, %59, 0, 0, 0, implicit $mode, implicit $exec + %61:vreg_128_align2 = DS_READ_B128_gfx9 %4, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + %62:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub0_sub1, %29.sub0_sub1, %13, 0, 0, 0, implicit $mode, implicit $exec + %63:vreg_128_align2 = IMPLICIT_DEF + DS_WRITE_B128_gfx9 %1, %63, 20480, 0, implicit $exec :: (store (s128), addrspace 3) + %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub2_sub3, %29.sub2_sub3, %62, 0, 0, 0, implicit $mode, implicit $exec + %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub0_sub1, %28.sub0_sub1, %12, 0, 0, 0, implicit $mode, implicit $exec + %66:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub2_sub3, %28.sub2_sub3, %65, 0, 0, 0, implicit $mode, implicit $exec + %67:vreg_128_align2 = DS_READ_B128_gfx9 %4, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + %68:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %61.sub0_sub1, %29.sub0_sub1, %11, 0, 0, 0, implicit $mode, implicit $exec + %69:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24, %6, 0, 3072, 
0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 22bc62acce15d..16ea95437881b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX90A-VGPR: ; %bb.0: ; %bb ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 4 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; 
GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -647,37 +647,37 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX90A-VGPR: ; %bb.0: ; %bb ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 10 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; 
GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -731,10 +731,10 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0 ; GFX90A-VGPR-NEXT: s_nop 3 -; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[2:3], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_4x4x4f64: @@ -747,10 +747,10 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, 
double ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0 ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[2:3], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0) @@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 64 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] 
op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits: @@ -1326,26 +1326,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 64 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 
v[4:5], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0) @@ -1627,26 +1627,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: 
v_mov_b32_e32 v5, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm: @@ -1655,26 +1655,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 +; 
GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) @@ -1741,26 +1741,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; GFX90A-VGPR-NEXT: 
v_mov_b32_e32 v7, v1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit: @@ -1769,26 +1769,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; 
GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 13a96cfa6e650..ceeb00ba55197 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -269,26 +269,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; 
GCN-NEXT: v_mov_b32_e32 v35, s23 ; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_mov_b32_e32 v16, s16 -; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v32, s16 +; GCN-NEXT: v_mov_b32_e32 v33, s17 +; GCN-NEXT: v_mov_b32_e32 v34, s18 +; GCN-NEXT: v_mov_b32_e32 v35, s19 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NEXT: v_mov_b32_e32 v18, s14 -; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s12 +; GCN-NEXT: v_mov_b32_e32 v33, s13 +; GCN-NEXT: v_mov_b32_e32 v34, s14 +; GCN-NEXT: v_mov_b32_e32 v35, s15 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s8 +; GCN-NEXT: v_mov_b32_e32 v33, s9 +; GCN-NEXT: v_mov_b32_e32 v34, s10 +; GCN-NEXT: v_mov_b32_e32 v35, s11 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -332,26 +332,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: v_mov_b32_e32 v35, s23 ; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_mov_b32_e32 v16, s16 
-; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v32, s16 +; GCN-NEXT: v_mov_b32_e32 v33, s17 +; GCN-NEXT: v_mov_b32_e32 v34, s18 +; GCN-NEXT: v_mov_b32_e32 v35, s19 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NEXT: v_mov_b32_e32 v18, s14 -; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s12 +; GCN-NEXT: v_mov_b32_e32 v33, s13 +; GCN-NEXT: v_mov_b32_e32 v34, s14 +; GCN-NEXT: v_mov_b32_e32 v35, s15 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s8 +; GCN-NEXT: v_mov_b32_e32 v33, s9 +; GCN-NEXT: v_mov_b32_e32 v34, s10 +; GCN-NEXT: v_mov_b32_e32 v35, s11 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index ab0000f6831b6..3646d81ed435b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -141,18 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -179,18 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -198,18 
+198,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v8, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -260,18 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], 
v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -298,18 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -317,18 +317,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v8, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; VGPRRC-NEXT: 
v_mov_b64_e32 v[10:11], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -1506,26 +1506,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: 
v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -1609,26 +1609,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: 
global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -1666,26 +1666,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 2 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: 
v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) @@ -1848,26 +1848,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; 
SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -1951,26 +1951,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; 
HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -2008,26 +2008,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 2 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 
v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) @@ -3182,18 +3182,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] -; VGPRRC-NEXT: s_nop 11 +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 +; VGPRRC-NEXT: s_nop 9 ; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 @@ -3214,14 +3212,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8: @@ -3594,18 +3592,16 @@ define 
amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1 -; VGPRRC-NEXT: s_nop 11 +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 +; VGPRRC-NEXT: s_nop 9 ; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 @@ -3626,14 +3622,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags: @@ -4146,33 +4142,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; 
SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: 
v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4256,33 +4251,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] 
offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -4320,33 +4314,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; VGPRRC-NEXT: s_nop 6 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: 
v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) @@ -4523,33 +4516,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; 
SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4633,33 +4625,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: 
v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -4697,33 +4688,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 
x i32> %a ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: s_nop 6 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; 
VGPRRC-NEXT: v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) @@ -5421,18 +5411,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5440,18 +5430,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; HEURRC-NEXT: 
v_mov_b64_e32 v[10:11], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5459,18 +5449,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v8, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -5521,18 +5511,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: 
v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5540,18 +5530,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ 
-5559,18 +5549,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v8, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 7e30af96bb8b9..aa670dce4e6f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -1155,17 +1155,17 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 9 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x0 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: 
v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg @@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) ; ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1: ; GFX942-VGPR: ; %bb.0: -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2 
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3) @@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, 
v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> , i32 0, i32 0, i32 0) @@ -5542,6 +5542,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v1 @@ -5570,39 +5572,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[22:23] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[20:21] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[18:19] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[16:17] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[14:15] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[12:13] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[10:11] -; GFX942-VGPR-NEXT: 
v_mov_b64_e32 v[10:11], v[8:9] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[62:63], v[30:31] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v64, 2.0 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[60:61], v[28:29] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[58:59], v[26:27] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[56:57], v[24:25] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[54:55], v[22:23] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[52:53], v[20:21] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[50:51], v[18:19] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[48:49], v[16:17] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[46:47], v[14:15] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[44:45], v[12:13] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[42:43], v[10:11] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[40:41], v[8:9] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[38:39], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[36:37], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[34:35], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[0:1] ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33] +; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: 
global_store_dwordx4 v1, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[60:63], s[0:1] offset:112 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[56:59], s[0:1] offset:96 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[52:55], s[0:1] offset:80 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[48:51], s[0:1] offset:64 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[44:47], s[0:1] offset:48 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[40:43], s[0:1] offset:32 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[36:39], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[32:35], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) @@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat: ; GFX942-VGPR: ; %bb.0: ; %bb -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -5804,19 +5804,19 @@ define amdgpu_kernel void 
@test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: ; GFX942-VGPR: ; %bb.0: ; %bb -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index f0205a3a788ed..a8d2f64c3c4d9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -5093,43 +5093,42 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; SDAG-NEXT: s_nop 14 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; 
SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b64_e32 v[36:37], 48 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[38:39], 32 +; SDAG-NEXT: v_mov_b64_e32 v[40:41], 16 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[42:43], 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 
v[38:39], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5137,6 +5136,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0 +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16 +; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] @@ -5154,28 +5156,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48 +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] +; 
GISEL-NEXT: v_mov_b64_e32 v[40:41], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5190,71 +5197,71 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: v_mov_b32_e32 v32, 42 ; SDAG-NEXT: v_mov_b32_e32 v33, 25 +; SDAG-NEXT: v_mov_b64_e32 v[36:37], 48 +; SDAG-NEXT: v_mov_b64_e32 v[38:39], 32 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; 
SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_mov_b32_e32 v24, s20 -; SDAG-NEXT: v_mov_b32_e32 v25, s21 -; SDAG-NEXT: v_mov_b32_e32 v26, s22 -; SDAG-NEXT: v_mov_b32_e32 v27, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v28, s24 -; SDAG-NEXT: v_mov_b32_e32 v29, s25 -; SDAG-NEXT: v_mov_b32_e32 v30, s26 -; SDAG-NEXT: v_mov_b32_e32 v31, s27 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: 
v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[40:41], 16 +; SDAG-NEXT: v_mov_b64_e32 v[42:43], 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, 
s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5264,52 +5271,52 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; GISEL-NEXT: v_mov_b32_e32 v32, 25 ; GISEL-NEXT: v_mov_b32_e32 v33, 42 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; 
GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] ; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[18:19] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16 +; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48 +; GISEL-NEXT: 
v_mov_b64_e32 v[36:37], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index 5475fa2ae5c6e..ef3bb0cb5f4f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -71,9 +71,9 @@ define amdgpu_kernel void 
@test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd: @@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0 ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: s_nop 6 +; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 9a23788f8855a..4e20c999f5309 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -369,68 +369,69 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b32 s0, 0 
; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mov_b32_e32 v16, 0x7fc00000 ; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] ; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: v_accvgpr_write_b32 a3, s1 +; CHECK-NEXT: v_accvgpr_write_b32 a2, s0 +; CHECK-NEXT: v_mov_b32_e32 v17, v16 +; CHECK-NEXT: v_mov_b32_e32 v18, v16 +; CHECK-NEXT: v_mov_b32_e32 v19, v16 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], a[0:1], a[2:3], v[4:7] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; CHECK-NEXT: v_mov_b32_e32 v5, v4 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11] -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7] +; CHECK-NEXT: v_accvgpr_write_b32 a5, s1 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], a[0:1], a[0:1], v[16:19] +; CHECK-NEXT: v_accvgpr_write_b32 a4, s0 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], a[0:1], a[4:5], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[28:31], a[0:1], a[0:1], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], a[0:1], a[0:1], v[16:19] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], a[0:1], v[4:7] ; CHECK-NEXT: ;;#ASMSTART 
; CHECK-NEXT: ; def v[4:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7] +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], a[0:1], a[2:3], v[4:7] +; CHECK-NEXT: s_nop 3 +; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v28 +; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], a[0:1], a[0:1], v[12:15] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], a[0:1], v[0:3] ; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15] -; CHECK-NEXT: s_nop 2 ; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3] -; CHECK-NEXT: global_store_short v[12:13], v17, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], a[0:1], a[0:1], v[4:7] +; CHECK-NEXT: global_store_short v[12:13], v9, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v9, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27] +; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v10 +; CHECK-NEXT: global_store_short v[12:13], v8, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], a[0:1], a[0:1], v[24:27] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 ; CHECK-NEXT: global_store_short v[12:13], v1, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23] +; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], a[0:1], a[0:1], v[20:23] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 ; CHECK-NEXT: global_store_short v[12:13], v14, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[2:3], a[0:1], v[8:11] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11] -; CHECK-NEXT: s_nop 6 +; CHECK-NEXT: s_nop 3 ; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[4:5], a[0:1], v[4:7] ; CHECK-NEXT: global_store_short v[12:13], v8, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -507,13 +508,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5] +; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5] ; CHECK-NEXT: s_nop 3 -; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0] ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v9 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:7] ; CHECK-NEXT: ;;#ASMEND @@ -635,46 +636,14 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: 
v_accvgpr_read_b32 v2, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v18, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v20, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; CHECK-NEXT: global_store_dwordx4 v32, a[24:27], s[2:3] offset:96 +; CHECK-NEXT: global_store_dwordx4 v32, a[28:31], s[2:3] offset:112 +; CHECK-NEXT: global_store_dwordx4 v32, a[16:19], s[2:3] offset:64 +; CHECK-NEXT: global_store_dwordx4 v32, a[20:23], s[2:3] offset:80 +; CHECK-NEXT: global_store_dwordx4 v32, a[8:11], s[2:3] offset:32 +; CHECK-NEXT: global_store_dwordx4 v32, 
a[12:15], s[2:3] offset:48 +; CHECK-NEXT: global_store_dwordx4 v32, a[0:3], s[2:3] +; CHECK-NEXT: global_store_dwordx4 v32, a[4:7], s[2:3] offset:16 ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) @@ -756,15 +725,15 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_and_b32_e32 v12, 0x3ff, v31 ; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] -; CHECK-NEXT: v_and_b32_e32 v2, 0x3ff, v31 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3] +; CHECK-NEXT: s_nop 3 ; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1] -; CHECK-NEXT: s_nop 8 -; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off +; CHECK-NEXT: v_lshlrev_b32_e32 v4, 3, v12 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5] +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: global_store_dwordx2 v[4:5], a[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %src2 = call double asm sideeffect "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll index a81d9a458e23a..08f89b32edb20 100644 --- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll @@ -101,8 +101,13 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[6:9] +; CHECK-NEXT: ; def v[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] @@ -112,37 +117,75 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 +; CHECK-NEXT: v_mov_b32_e32 v60, 0 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a56 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a57 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a58 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a59 +; CHECK-NEXT: global_store_dwordx4 v60, v[56:59], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v60, v[52:55], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v60, v[48:51], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v60, v[44:47], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v60, v[40:43], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v60, v[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, 
v[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v60, v[32:35], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a33 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a34 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a35 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a36 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a37 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a38 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a39 +; CHECK-NEXT: v_accvgpr_read_b32 v8, a40 +; CHECK-NEXT: v_accvgpr_read_b32 v9, a41 +; CHECK-NEXT: v_accvgpr_read_b32 v10, a42 +; CHECK-NEXT: v_accvgpr_read_b32 v11, a43 +; CHECK-NEXT: v_accvgpr_read_b32 v12, a44 +; CHECK-NEXT: v_accvgpr_read_b32 v13, a45 +; CHECK-NEXT: v_accvgpr_read_b32 v14, a46 +; CHECK-NEXT: v_accvgpr_read_b32 v15, a47 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a48 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a49 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a50 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a51 +; CHECK-NEXT: v_accvgpr_read_b32 v20, a52 +; CHECK-NEXT: v_accvgpr_read_b32 v21, a53 +; CHECK-NEXT: v_accvgpr_read_b32 v22, a54 +; CHECK-NEXT: v_accvgpr_read_b32 v23, a55 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a60 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a61 +; CHECK-NEXT: v_accvgpr_read_b32 v30, a62 +; CHECK-NEXT: v_accvgpr_read_b32 v31, a63 +; CHECK-NEXT: global_store_dwordx4 v60, v[24:27], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112 +; CHECK-NEXT: global_store_dwordx4 v60, v[28:31], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v60, v[16:19], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v60, v[20:23], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, 
a[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v60, v[8:11], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v60, v[12:15], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v60, v[4:7], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -301,16 +344,26 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 ; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 ; CHECK-NEXT: v_accvgpr_read_b32 v7, a3 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_accvgpr_read_b32 v6, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[8:11] +; CHECK-NEXT: ; def v[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte 
Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[12:15] +; CHECK-NEXT: ; def v[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND @@ -319,39 +372,82 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 +; CHECK-NEXT: v_mov_b32_e32 v60, 0 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a56 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a57 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a58 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a59 +; CHECK-NEXT: global_store_dwordx4 v60, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v60, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v60, v[48:51], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v60, v[44:47], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v60, v[40:43], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; 
CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v60, v[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v60, v[32:35], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a33 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a34 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a35 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a36 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a37 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a38 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a39 +; CHECK-NEXT: v_accvgpr_read_b32 v8, a40 +; CHECK-NEXT: v_accvgpr_read_b32 v9, a41 +; CHECK-NEXT: v_accvgpr_read_b32 v10, a42 +; CHECK-NEXT: v_accvgpr_read_b32 v11, a43 +; CHECK-NEXT: v_accvgpr_read_b32 v12, a44 +; CHECK-NEXT: v_accvgpr_read_b32 v13, a45 +; CHECK-NEXT: v_accvgpr_read_b32 v14, a46 +; CHECK-NEXT: v_accvgpr_read_b32 v15, a47 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a48 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a49 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a50 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a51 +; CHECK-NEXT: v_accvgpr_read_b32 v20, a52 +; CHECK-NEXT: v_accvgpr_read_b32 v21, a53 +; CHECK-NEXT: v_accvgpr_read_b32 v22, a54 +; CHECK-NEXT: v_accvgpr_read_b32 v23, a55 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a60 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a61 +; CHECK-NEXT: v_accvgpr_read_b32 v30, a62 +; CHECK-NEXT: v_accvgpr_read_b32 v31, a63 +; CHECK-NEXT: global_store_dwordx4 v60, v[24:27], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v60, v[28:31], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96 +; CHECK-NEXT: global_store_dwordx4 v60, v[16:19], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; 
CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112 +; CHECK-NEXT: global_store_dwordx4 v60, v[20:23], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v60, v[8:11], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v60, v[12:15], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v60, v[4:7], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17] +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], 
s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload