From 7ef2da809fea65fd1249e3abb9f94e294c3a6ce8 Mon Sep 17 00:00:00 2001
From: mssefat
Date: Thu, 4 Sep 2025 12:31:49 -0400
Subject: [PATCH 01/20] [AMDGPU] Improve register allocation to reduce MFMA hazard NOPs

rebased
---
 .../Target/AMDGPU/GCNPreRAOptimizations.cpp   |   97 ++
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |   17 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |   34 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |    4 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir |  523 +++----
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  542 +++----
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |  112 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  270 ++--
 ...amdgcn.mfma.hint.hazard.barrier.gfx942.mir | 1292 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  |  146 +-
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll |  159 +-
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll    |   12 +-
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |   69 +-
 .../unspill-vgpr-after-rewrite-vgpr-mfma.ll   |   33 +-
 14 files changed, 2381 insertions(+), 929 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..6d2b10bdb5804 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -34,6 +34,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -43,6 +44,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
 
+static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
+    "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
+    cl::desc("Enable Register Avoidance for "
+             "MFMA in GCNPreRAOptimizations stage."),
+    cl::init(true));
+
 namespace {
 
 class GCNPreRAOptimizationsImpl {
@@ -248,6 +255,96 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
 
   bool Changed = false;
 
+  // Scan each block once. For every VGPR operand of a load, store, copy or
+  // VALU instruction that follows an MFMA, record that it should avoid the
+  // physical register of that MFMA's dst/src C register if the latter is no
+  // longer live there, so fewer hazard NOPs are needed after allocation.
+  if (EnableRegisterAvoidListForMFMARegs && ST.hasMAIInsts()) {
+    // Max number of preceding MFMAs tracked for RAW or WAW hazards
+    constexpr unsigned MaxLookbackWindow = 19;
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    for (const MachineBasicBlock &MBB : MF) {
+
+      SmallVector<std::pair<SlotIndex, SmallVector<Register, 4>>, 16>
+          RecentMFMAs;
+      for (const MachineInstr &MI : MBB) {
+        if (MI.isDebugInstr())
+          continue;
+        const SlotIndex CurrentSlot = LIS->getInstructionIndex(MI).getRegSlot();
+        // Handle MFMA instructions
+        if (SIInstrInfo::isMFMA(MI)) {
+          SmallVector<Register, 4> MFMARegisters;
+          auto collectMFMARegister = [&](unsigned OpIdx) {
+            if (OpIdx >= MI.getNumOperands())
+              return;
+
+            const MachineOperand &MO = MI.getOperand(OpIdx);
+            if (MO.isReg() && MO.getReg().isVirtual())
+              MFMARegisters.push_back(MO.getReg());
+          };
+          // Only collect Matrix C (operand 3) and destination (operand 0)
+          // registers
+          collectMFMARegister(0);
+          collectMFMARegister(3);
+
+          if (!MFMARegisters.empty()) {
+            RecentMFMAs.emplace_back(CurrentSlot, std::move(MFMARegisters));
+            // Maintain window
+            if (RecentMFMAs.size() > MaxLookbackWindow)
+              RecentMFMAs.erase(RecentMFMAs.begin());
+          }
+          continue;
+        }
+        bool ShouldCheckReuse = MI.mayLoad() || MI.mayStore() || MI.isCopy() ||
+                                SIInstrInfo::isVALU(MI);
+        // Skip non-relevant instructions, or skip until at least one MFMA is
+        // encountered
+        if (!ShouldCheckReuse || RecentMFMAs.empty())
+          continue;
+
+        // Process operands that might reuse MFMA registers
+        for (const MachineOperand &MO : MI.operands()) {
+          if (!MO.isReg() || !MO.getReg().isVirtual())
+            continue;
+
+          const Register CandidateReg = MO.getReg();
+          const TargetRegisterClass *CandidateRC =
+              MRI->getRegClass(CandidateReg);
+
+          // Only process VGPR registers
+          if (!TRI->isVGPRClass(CandidateRC))
+            continue;
+
+          for (auto It = RecentMFMAs.rbegin(); It != RecentMFMAs.rend(); ++It) {
+            const SmallVector<Register, 4> &MFMARegs = It->second;
+            for (Register MFMAReg : MFMARegs) {
+              // Verify register class compatibility
+              const TargetRegisterClass *MFMARC = MRI->getRegClass(MFMAReg);
+              if (!TRI->hasVGPRs(MFMARC))
+                continue;
+
+              // Check if MFMA register is dead at current instruction
+              const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
+              if (!MFMAInterval.liveAt(CurrentSlot)) {
+
+                // Add bidirectional avoidance hint
+                MFI->addRegisterToAvoid(CandidateReg, MFMAReg);
+                MFI->addRegisterToAvoid(MFMAReg, CandidateReg);
+
+                // Set hint if we found registers to avoid
+                MRI->setRegAllocationHint(
+                    MFMAReg, AMDGPURI::HasRegisterAvoidanceList, Register());
+                MRI->setRegAllocationHint(CandidateReg,
+                                          AMDGPURI::HasRegisterAvoidanceList,
+                                          Register());
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
   for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
     Register Reg = Register::index2VirtReg(I);
     if (!LIS->hasInterval(Reg))
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index b7dbb5994ee41..a66e342bef42c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1214,6 +1214,23 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 
   AMDGPU::ClusterDimsAttr getClusterDims() const { return ClusterDims; }
+
+  /// For each virtual register, the virtual registers whose assigned physical
+  /// registers it should try not to reuse.
+  DenseMap<Register, SmallVector<Register, 4>> RegisterAvoidanceMap;
+
+  /// Record that \p VirtReg should avoid the register assigned to \p AvoidReg.
+  void addRegisterToAvoid(Register VirtReg, Register AvoidReg) {
+    RegisterAvoidanceMap[VirtReg].push_back(AvoidReg);
+  }
+
+  /// Return the registers \p VirtReg should avoid (empty if none recorded).
+  ArrayRef<Register> getRegistersToAvoid(Register VirtReg) const {
+    auto It = RegisterAvoidanceMap.find(VirtReg);
+    if (It != RegisterAvoidanceMap.end())
+      return It->second;
+    return ArrayRef<Register>();
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 311557909916a..80743157b2724 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3830,6 +3830,40 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
     }
     return false;
   }
+  case AMDGPURI::HasRegisterAvoidanceList: {
+    // Hint every non-reserved register in Order that does not alias a physical
+    // register already assigned to one of the registers VirtReg should avoid.
+    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    ArrayRef<Register> AvoidRegs = MFI->getRegistersToAvoid(VirtReg);
+
+    if (AvoidRegs.empty())
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+    // Collect physical registers to avoid
+    SmallSet<MCPhysReg, 16> AvoidPhysRegs;
+    for (Register AvoidReg : AvoidRegs) {
+      if (VRM && VRM->hasPhys(AvoidReg)) {
+        // Virtual register already mapped - try to avoid its physical register
+        MCPhysReg AvoidPhys = VRM->getPhys(AvoidReg);
+        for (MCRegAliasIterator AI(AvoidPhys, this, true); AI.isValid(); ++AI)
+          AvoidPhysRegs.insert(*AI);
+      }
+    }
+
+    if (AvoidPhysRegs.empty()) {
+      // No physical registers added yet - use default order
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+    }
+
+    // Prioritize registers that don't conflict with avoided registers
+    for (MCPhysReg PhysReg : Order) {
+      if (!AvoidPhysRegs.count(PhysReg) && !MRI.isReserved(PhysReg))
+        Hints.push_back(PhysReg);
+    }
+
+    return false;
+  }
   default:
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7b91ba7bc581f..ed0c580abc952 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -31,9 +31,11 @@ class RegisterBank;
 struct SGPRSpillBuilder;
 
 /// Register allocation hint types. Helps eliminate unneeded COPY with True16
+/// HasRegisterAvoidanceList helps with minimizing usage of conflicting physical
+/// registers
 namespace AMDGPURI {
 
-enum { Size16 = 1, Size32 = 2 };
+enum { Size16 = 1, Size32 = 2, HasRegisterAvoidanceList = 3 };
 
 } // end namespace AMDGPURI
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index b07dec326327e..d4380fd41310a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -15,9 +15,12 @@ ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ; implicit-def: $vgpr106 ; GCN-NEXT: ; implicit-def: $vgpr132 + ; GCN-NEXT: ; implicit-def: $vgpr112 + ; GCN-NEXT: ; implicit-def: $vgpr113 + ; GCN-NEXT: ; implicit-def: $vgpr114 + ; GCN-NEXT: ; implicit-def: $vgpr115 ; GCN-NEXT: ; implicit-def: $vgpr133 ; GCN-NEXT: ; implicit-def: $vgpr139 - ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -167,46 +170,45 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 - ; GCN-NEXT: ; implicit-def: $vgpr64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: ; implicit-def: $vgpr73 - ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 + ; GCN-NEXT: v_add_u32_e32 v73, v132, v112 ; GCN-NEXT: 
buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v113 + ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr74 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 - ; GCN-NEXT: ; implicit-def: $vgpr75 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v114 ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v115 ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ; kill: killed $vgpr73 ; GCN-NEXT: ds_read_b128 v[72:75], v94 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr76 ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ; implicit-def: $sgpr8 + ; GCN-NEXT: ; implicit-def: $vgpr112 + ; GCN-NEXT: ; implicit-def: $vgpr113 + ; GCN-NEXT: ; implicit-def: $vgpr114 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 @@ -411,8 +413,6 @@ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 ; GCN-NEXT: ; 
implicit-def: $vgpr65 ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr68 - ; GCN-NEXT: ; implicit-def: $vgpr67 ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 @@ -440,40 +440,36 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 + ; GCN-NEXT: ; implicit-def: $vgpr96 ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 ; GCN-NEXT: ; implicit-def: $vgpr65 ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v96 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 - ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v112 + ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v113 ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v114 ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_fma_f32 v57, s4, v57, 
-v134 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_exp_f32_e32 v163, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v164, v57 + ; GCN-NEXT: v_exp_f32_e32 v165, v57 ; GCN-NEXT: v_exp_f32_e32 v49, v48 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 @@ -499,31 +495,30 @@ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 ; GCN-NEXT: v_exp_f32_e32 v55, v48 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 ; GCN-NEXT: v_exp_f32_e32 v56, v48 ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 + ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 + ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_fma_f32 v156, s4, v59, -v134 + ; GCN-NEXT: v_pack_b32_f16 v59, v68, v58 + ; GCN-NEXT: v_pack_b32_f16 v58, v64, v67 + ; GCN-NEXT: v_mul_f32_e32 v80, 0x3fb8aa3b, v66 ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v162, s4, 
v61, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] @@ -532,9 +527,15 @@ ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 + ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GCN-NEXT: v_fma_f32 v157, s4, v60, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[58:59], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v141, v80 ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 + ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] +<<<<<<< HEAD ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 @@ -542,10 +543,16 @@ ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 +======= + ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] +>>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs) ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] ; 
GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] +<<<<<<< HEAD ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 ; GCN-NEXT: v_exp_f32_e32 v58, v58 ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] @@ -556,13 +563,17 @@ ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 +======= + ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] +>>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs) ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 +<<<<<<< HEAD ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 ; GCN-NEXT: v_exp_f32_e32 v59, v57 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] @@ -571,249 +582,264 @@ ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 +======= + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95] + ; GCN-NEXT: v_cvt_f16_f32_e32 v144, v54 + ; GCN-NEXT: v_cvt_f16_f32_e32 v145, v55 + ; GCN-NEXT: v_exp_f32_e32 v167, v57 + ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; 
GCN-NEXT: v_mul_f32_e32 v168, 0x3fb8aa3b, v157 + ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] +>>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs) ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[58:59], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v148, v56 ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v149, v145, v148 + ; GCN-NEXT: v_pack_b32_f16 v148, v140, v144 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v156 + ; GCN-NEXT: v_exp_f32_e32 v168, v168 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v153, v140 + ; GCN-NEXT: ; implicit-def: $vgpr140 + ; GCN-NEXT: v_fma_f32 v164, s4, v61, -v134 + ; GCN-NEXT: v_fma_f32 v166, s4, v62, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v169, v141 ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 - ; GCN-NEXT: ; implicit-def: $vgpr57 - ; GCN-NEXT: ds_read_b128 v[60:63], v57 + ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 + ; GCN-NEXT: v_fma_f32 v57, s4, v35, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[148:149], v[64:79] + ; GCN-NEXT: ds_read_b128 v[142:145], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v160, v149 - ; 
GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 + ; GCN-NEXT: ds_read_b128 v[156:159], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 - ; GCN-NEXT: v_exp_f32_e32 v162, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[148:149], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v164 + ; GCN-NEXT: v_fma_f32 v164, s4, v33, -v134 + ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v166 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v165 + ; GCN-NEXT: v_exp_f32_e32 v170, v146 + ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v167 ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[148:149], v[96:111] ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v153 + ; GCN-NEXT: v_pack_b32_f16 v62, v169, v147 ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 + ; GCN-NEXT: v_perm_b32 v147, v131, v129, s8 + ; GCN-NEXT: v_pack_b32_f16 v63, v146, v33 ; GCN-NEXT: v_mul_f32_e32 
v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 - ; GCN-NEXT: v_exp_f32_e32 v153, v33 - ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 - ; GCN-NEXT: v_exp_f32_e32 v165, v60 - ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 - ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v161, v61 - ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[148:149], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v148, v33 + ; GCN-NEXT: v_fma_f32 v152, s4, v36, -v134 + ; GCN-NEXT: v_perm_b32 v36, v162, v160, s5 + ; GCN-NEXT: v_cvt_f16_f32_e32 v149, v168 + ; GCN-NEXT: v_cvt_f16_f32_e32 v155, v170 + ; GCN-NEXT: v_perm_b32 v146, v163, v161, s8 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[62:63], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_read_b128 v[32:35], v140 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[58:61], v140 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v164 + ; GCN-NEXT: v_exp_f32_e32 v154, v142 + ; GCN-NEXT: v_perm_b32 v142, v162, v160, s8 + ; GCN-NEXT: v_fma_f32 v160, s4, v38, -v134 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[80:95], v[156:157], v[62:63], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v157, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v148 + ; GCN-NEXT: v_fma_f32 v156, s4, v37, -v134 ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 - ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: v_perm_b32 v143, v130, v128, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] - ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[62:63], v[96:111] ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 - ; GCN-NEXT: v_exp_f32_e32 v159, v33 + ; GCN-NEXT: v_perm_b32 v32, v163, v161, s5 + ; GCN-NEXT: v_exp_f32_e32 v161, v33 ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v152, v38 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[60:61] + ; GCN-NEXT: ds_write_b64 v136, v[142:143] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v137, v[32:33] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[58:59], v[62:63], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v59, v150, v38 + ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_pack_b32_f16 v58, v149, v155 + ; GCN-NEXT: v_exp_f32_e32 v149, v38 ; GCN-NEXT: ; implicit-def: $vgpr33 ; GCN-NEXT: ; implicit-def: $vgpr38 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[140:141] + ; GCN-NEXT: ds_write_b64 
v138, v[146:147] ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; implicit-def: $vgpr36 ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[144:145], v[58:59], v[64:79] ; GCN-NEXT: ; implicit-def: $vgpr37 ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v152 + ; GCN-NEXT: v_exp_f32_e32 v150, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 - ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 - ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[158:159], v[58:59], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v156, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v160 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v154 + ; GCN-NEXT: v_cvt_f16_f32_e32 v152, v157 + ; GCN-NEXT: v_fma_f32 v57, s4, v39, -v134 ; GCN-NEXT: ds_read_b128 v[36:39], v139 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v154, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 - ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v155, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 - ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 - ; GCN-NEXT: v_exp_f32_e32 v157, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 - ; GCN-NEXT: v_exp_f32_e32 v146, v32 + ; GCN-NEXT: ds_read_b128 v[128:131], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[58:59], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v161 + ; GCN-NEXT: v_exp_f32_e32 v159, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v149 + ; GCN-NEXT: v_fma_f32 v155, s4, v41, -v134 + ; GCN-NEXT: v_fma_f32 v158, s4, v42, -v134 + ; GCN-NEXT: v_fma_f32 v162, s4, v20, -v134 + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[60:61], v[58:59], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v59, v34, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_pack_b32_f16 v58, v33, v152 + ; GCN-NEXT: v_exp_f32_e32 v60, v32 ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 - 
; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] + ; GCN-NEXT: v_fma_f32 v57, s4, v43, -v134 + ; GCN-NEXT: v_perm_b32 v20, v142, v62, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[58:59], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v147, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v143, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 + ; GCN-NEXT: v_exp_f32_e32 v61, v36 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v150 + ; GCN-NEXT: v_fma_f32 v155, s4, v46, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[128:129], v[58:59], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v152, v36 + ; GCN-NEXT: v_cvt_f16_f32_e32 v128, v156 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_fma_f32 v129, s4, v45, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[58:59], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v159 + ; GCN-NEXT: v_exp_f32_e32 v158, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v129, v36 + ; GCN-NEXT: v_pack_b32_f16 v32, 
v37, v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[58:59], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v57, v36 ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 - ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v61 + ; GCN-NEXT: v_fma_f32 v58, s4, v47, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v57 + ; GCN-NEXT: ds_read_b128 v[36:39], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v142, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 - ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 + ; GCN-NEXT: v_exp_f32_e32 v128, v40 + ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v129, v152 + ; GCN-NEXT: ds_read_b128 v[44:47], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v63, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 - ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 - ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[130:131], v[32:33], v[80:95] + ; GCN-NEXT: v_fma_f32 v130, s4, v17, -v134 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_exp_f32_e32 v131, v40 + ; GCN-NEXT: v_pack_b32_f16 v40, v59, v129 + ; GCN-NEXT: v_fma_f32 v155, s4, v18, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v128 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 - ; GCN-NEXT: v_exp_f32_e32 v158, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v158 + ; GCN-NEXT: v_exp_f32_e32 v160, v17 + 
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v57 ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v58 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v128, v17 - ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 - ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 + ; GCN-NEXT: v_fma_f32 v58, s4, v19, -v134 + ; GCN-NEXT: v_exp_f32_e32 v129, v17 + ; GCN-NEXT: v_perm_b32 v42, v143, v63, s8 + ; GCN-NEXT: v_perm_b32 v43, v147, v145, s8 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 + ; GCN-NEXT: ds_read_b128 v[16:19], v140 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 + ; GCN-NEXT: ds_read_b128 v[32:35], v140 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 - ; GCN-NEXT: v_exp_f32_e32 v167, v36 - ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 + ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v130 + ; GCN-NEXT: v_exp_f32_e32 v163, v36 + ; GCN-NEXT: v_perm_b32 v36, v142, v62, s8 ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 - ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v160 + ; GCN-NEXT: v_perm_b32 v21, v146, v144, s5 + ; GCN-NEXT: v_perm_b32 v37, v146, v144, s8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v131 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b64 v135, v[20:21] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], 
v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 - ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v140, v17 - ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: v_perm_b32 v16, v143, v63, s5 + ; GCN-NEXT: v_fma_f32 v63, s4, v22, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v129 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_exp_f32_e32 v142, v17 + ; GCN-NEXT: v_perm_b32 v17, v147, v145, s5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v136, v[36:37] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v58 ; GCN-NEXT: v_exp_f32_e32 v144, v22 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -836,22 +862,22 @@ ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 + ; GCN-NEXT: v_pack_b32_f16 v32, v59, v44 ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v162 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] ; GCN-NEXT: v_exp_f32_e32 v132, v16 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 - ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v163 + ; GCN-NEXT: 
v_fma_f32 v143, s4, v23, -v134 ; GCN-NEXT: ds_read_b128 v[20:23], v139 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -860,20 +886,20 @@ ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v63 ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130 ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 + ; GCN-NEXT: v_fma_f32 v63, s4, v26, -v134 + ; GCN-NEXT: v_fma_f32 v147, s4, v4, -v134 ; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v142 ; GCN-NEXT: v_exp_f32_e32 v145, v16 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v143 ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 ; GCN-NEXT: v_exp_f32_e32 v35, v16 ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 @@ -895,11 +921,11 @@ ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v63 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v141, v16 + ; GCN-NEXT: v_exp_f32_e32 v143, v16 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 + ; GCN-NEXT: v_fma_f32 v63, s4, v30, -v134 ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] @@ 
-907,25 +933,25 @@ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v57 + ; GCN-NEXT: ds_read_b128 v[20:23], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_exp_f32_e32 v36, v24 ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 + ; GCN-NEXT: ds_read_b128 v[28:31], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v63 ; GCN-NEXT: v_exp_f32_e32 v39, v24 ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 + ; GCN-NEXT: v_fma_f32 v63, s4, v2, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 - ; GCN-NEXT: v_exp_f32_e32 v148, v1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v143 + ; GCN-NEXT: v_exp_f32_e32 v146, v1 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 @@ -933,25 +959,25 @@ ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 ; GCN-NEXT: v_exp_f32_e32 v34, v1 ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 + ; GCN-NEXT: v_perm_b32 v27, v59, v45, s8 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 + ; GCN-NEXT: ds_read_b128 v[16:19], v140 
offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v150, v20 + ; GCN-NEXT: v_exp_f32_e32 v155, v20 ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 + ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v146 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v38, v21 ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 + ; GCN-NEXT: v_perm_b32 v5, v58, v44, s5 + ; GCN-NEXT: v_perm_b32 v21, v58, v44, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND @@ -961,9 +987,9 @@ ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v63 ; GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: v_perm_b32 v1, v59, v45, s5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v136, v[20:21] @@ -987,10 +1013,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v147 ; GCN-NEXT: v_exp_f32_e32 v26, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v155 ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1042,10 +1068,10 @@ ; GCN-NEXT: v_exp_f32_e32 v21, v9 ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v57 + ; GCN-NEXT: ds_read_b128 
v[4:7], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 + ; GCN-NEXT: ds_read_b128 v[12:15], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 @@ -1071,33 +1097,33 @@ ; GCN-NEXT: v_add_f32_e32 v3, v54, v3 ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v168, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v170, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v149, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v60, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v61, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v57, v3 ; GCN-NEXT: v_add_f32_e32 v3, 
v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v131, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 @@ -1105,14 +1131,14 @@ ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 @@ -1137,17 +1163,18 @@ ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2 + ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111] + ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] - ; GCN-NEXT: ; implicit-def: $vgpr4 - ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v2, s[6:7] + ; GCN-NEXT: v_fmac_f32_e32 v1, v0, 
v48 + ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0887fdf0844b0..be97a1e82fcf2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -10,25 +10,24 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s20, v2 ; GCN-NEXT: ; implicit-def: $sgpr4 - ; GCN-NEXT: ; implicit-def: $vgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr64 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr50 + ; GCN-NEXT: ; implicit-def: $vgpr76 ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: ; implicit-def: $vgpr49 ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 - ; GCN-NEXT: ; implicit-def: $vgpr51 - ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr50 ; GCN-NEXT: ; implicit-def: $vgpr77 ; GCN-NEXT: ; implicit-def: $vgpr78 ; GCN-NEXT: ; implicit-def: $vgpr79 ; GCN-NEXT: ; implicit-def: $vgpr80 - ; GCN-NEXT: ; implicit-def: $vgpr91 + ; GCN-NEXT: ; implicit-def: $vgpr81 + ; GCN-NEXT: ; implicit-def: $vgpr103 ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 + ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v64 ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -36,8 +35,9 @@ ; GCN-NEXT: s_lshl_b32 s4, s20, 7 ; GCN-NEXT: ; implicit-def: $vgpr5 ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1 - ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 - ; GCN-NEXT: v_and_b32_e32 v76, 
0x1fffffff, v76 + ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 + ; GCN-NEXT: v_add_u32_e32 v77, s20, v77 + ; GCN-NEXT: v_and_b32_e32 v77, 0x1fffffff, v77 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b128 v48, v[0:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -48,8 +48,8 @@ ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_add_u32_e32 v0, v0, v50 - ; GCN-NEXT: v_add_u32_e32 v1, v1, v50 + ; GCN-NEXT: v_add_u32_e32 v0, v0, v76 + ; GCN-NEXT: v_add_u32_e32 v1, v1, v76 ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -68,22 +68,22 @@ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 - ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6 - ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 1 - ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76 - ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: v_mul_lo_u32 v77, v77, s6 + ; GCN-NEXT: v_add_lshl_u32 v77, v78, v77, 1 ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 ; GCN-NEXT: ; implicit-def: $sgpr2 ; GCN-NEXT: ; implicit-def: $sgpr3 - ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 + ; GCN-NEXT: v_lshl_add_u32 v80, v81, 1, v79 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ds_read_b128 v[36:39], v51 + ; GCN-NEXT: ds_read_b128 v[36:39], v50 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512 + ; GCN-NEXT: ds_read_b128 v[44:47], v50 offset:512 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 @@ -107,20 +107,20 @@ ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v51 + ; GCN-NEXT: ds_read_b128 v[68:71], v50 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] ; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: v_add_u32_e32 v82, v32, v50 - ; GCN-NEXT: v_add_u32_e32 v83, v33, v50 - ; GCN-NEXT: ; kill: killed $vgpr82 + ; GCN-NEXT: v_add_u32_e32 v83, v32, v76 + ; GCN-NEXT: v_add_u32_e32 v76, v33, v76 ; GCN-NEXT: ; kill: killed $vgpr83 + ; GCN-NEXT: ; kill: killed $vgpr76 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31] - ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512 + ; GCN-NEXT: ds_read_b128 v[66:69], v50 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART @@ -131,20 +131,20 @@ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15] ; GCN-NEXT: ; implicit-def: $vgpr66 ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_max_f32_e32 v81, v67, v67 + ; GCN-NEXT: v_max_f32_e32 v82, v67, v67 ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31] ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2 ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v70 + ; GCN-NEXT: ds_write_b32 v77, v70 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: 
ds_write_b32 v77, v71 + ; GCN-NEXT: ds_write_b32 v78, v71 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v72 + ; GCN-NEXT: ds_write_b32 v79, v72 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16 @@ -152,11 +152,11 @@ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v21 ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v22 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v81 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25 ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 @@ -166,12 +166,12 @@ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29 ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v30 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31 ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84 + ; GCN-NEXT: v_max3_f32 v64, v64, v81, v84 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2 ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3 ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86 @@ -179,315 +179,315 @@ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5 ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v7 ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v81 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10 ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11 ; 
GCN-NEXT: v_max3_f32 v64, v64, v84, v85 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12 ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13 ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 + ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v68 - ; GCN-NEXT: ; implicit-def: $vgpr84 - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v70, v64, v65 + ; GCN-NEXT: ds_write_b32 v80, v68 + ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70 + ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; implicit-def: $vgpr87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] - ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 - ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 - ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 - ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 + ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 + ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 + ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[0:1] + ; GCN-NEXT: 
v_max_f32_e32 v64, v64, v64 + ; GCN-NEXT: v_max_f32_e32 v65, v82, v64 + ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v65 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v65 + ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v65 + ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v65 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 - ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 - ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 - ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 - ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 - ; GCN-NEXT: v_exp_f32_e32 v73, v16 - ; GCN-NEXT: v_exp_f32_e32 v74, v18 - ; GCN-NEXT: v_exp_f32_e32 v75, v19 + ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v65 + ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v65 + ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v65 + ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v65 + ; GCN-NEXT: v_exp_f32_e32 v72, v16 + ; GCN-NEXT: v_exp_f32_e32 v73, v17 + ; GCN-NEXT: v_exp_f32_e32 v81, v18 + ; GCN-NEXT: v_exp_f32_e32 v82, v19 ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_exp_f32_e32 v80, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 - ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 - ; GCN-NEXT: v_exp_f32_e32 v81, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 - ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 - ; GCN-NEXT: v_exp_f32_e32 v82, v22 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 - ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 - ; GCN-NEXT: v_exp_f32_e32 v83, v23 - ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v72 + ; GCN-NEXT: v_fma_f32 v17, s4, v24, -v65 + ; GCN-NEXT: 
v_exp_f32_e32 v83, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v73 + ; GCN-NEXT: v_fma_f32 v19, s4, v25, -v65 + ; GCN-NEXT: v_exp_f32_e32 v84, v21 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v81 + ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v65 ; GCN-NEXT: v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_exp_f32_e32 v17, v17 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 - ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 - ; GCN-NEXT: v_exp_f32_e32 v88, v23 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 - ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v82 + ; GCN-NEXT: v_pack_b32_f16 v24, v16, v18 + ; GCN-NEXT: v_sub_f32_e32 v22, v67, v65 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 + ; GCN-NEXT: v_pack_b32_f16 v25, v20, v21 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v19 + ; GCN-NEXT: ds_read_b128 v[16:19], v87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v16, v24 - ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 + ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v65 + ; GCN-NEXT: v_exp_f32_e32 v86, v23 + ; GCN-NEXT: v_exp_f32_e32 v64, v22 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], 
v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v16, 0, v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v83 + ; GCN-NEXT: v_fma_f32 v88, s4, v28, -v65 + ; GCN-NEXT: v_exp_f32_e32 v89, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v90, v84 + ; GCN-NEXT: v_fma_f32 v91, s4, v29, -v65 + ; GCN-NEXT: v_exp_f32_e32 v92, v21 + ; GCN-NEXT: ds_read_b128 v[20:23], v87 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 - ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 - ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, 
-v72 - ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 - ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 - ; GCN-NEXT: v_exp_f32_e32 v30, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 - ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 - ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 - ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 - ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 - ; GCN-NEXT: ds_read_b128 v[26:29], v91 + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_perm_b32 v99, v70, v68, s2 + ; GCN-NEXT: v_perm_b32 v100, v70, v68, s3 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v93, v73, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v94, v85 + ; GCN-NEXT: v_fma_f32 v95, s4, v30, -v65 + ; GCN-NEXT: v_exp_f32_e32 v96, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v97, v86 + ; GCN-NEXT: v_fma_f32 v98, s4, v31, -v65 + ; GCN-NEXT: v_perm_b32 v101, v71, v69, s2 + ; GCN-NEXT: v_perm_b32 v102, v71, v69, s3 + ; GCN-NEXT: ds_read_b128 v[68:71], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576 + ; GCN-NEXT: ds_read_b128 v[72:75], v103 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v31 - ; 
GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_exp_f32_e32 v31, v31 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 - ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 + ; GCN-NEXT: ds_write_b32 v77, v99 + ; GCN-NEXT: v_exp_f32_e32 v67, v67 + ; GCN-NEXT: v_pack_b32_f16 v76, v76, v90 + ; GCN-NEXT: v_pack_b32_f16 v77, v94, v97 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v64 + ; GCN-NEXT: ds_write_b32 v78, v100 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: ds_write_b32 v79, v101 + ; GCN-NEXT: v_mul_f32_e32 v78, 0x3fb8aa3b, v88 + ; GCN-NEXT: v_mul_f32_e32 v79, 0x3fb8aa3b, v91 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v81, v81, v93 + ; GCN-NEXT: v_cvt_f16_f32_e32 v90, v89 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v65 + ; GCN-NEXT: v_exp_f32_e32 v91, v78 + ; GCN-NEXT: v_cvt_f16_f32_e32 v78, v92 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v65 + ; GCN-NEXT: v_exp_f32_e32 v93, v79 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v65 - ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 - ; GCN-NEXT: v_exp_f32_e32 v22, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 - ; GCN-NEXT: v_exp_f32_e32 v64, v65 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 - ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 - ; GCN-NEXT: v_exp_f32_e32 v23, v23 - ; GCN-NEXT: 
v_cvt_f16_f32_e32 v19, v31 + ; GCN-NEXT: ds_write_b32 v80, v102 + ; GCN-NEXT: v_mul_f32_e32 v80, 0x3fb8aa3b, v95 + ; GCN-NEXT: v_add_f32_e32 v76, v82, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v96 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v65 + ; GCN-NEXT: v_exp_f32_e32 v80, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v79, v67 + ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v98 + ; GCN-NEXT: v_fma_f32 v81, s4, v3, -v65 + ; GCN-NEXT: v_exp_f32_e32 v82, v88 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 - ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19 - ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 - ; GCN-NEXT: v_exp_f32_e32 v25, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 - ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 - ; GCN-NEXT: v_exp_f32_e32 v27, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 - ; GCN-NEXT: v_exp_f32_e32 v65, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_pack_b32_f16 v0, v90, v78 + ; GCN-NEXT: v_pack_b32_f16 v1, v77, v79 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 - ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 - ; GCN-NEXT: v_exp_f32_e32 v68, v2 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v68, v83, v76 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v91 + ; GCN-NEXT: v_fma_f32 v83, s4, v4, -v65 + ; GCN-NEXT: v_exp_f32_e32 v90, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v93 + ; GCN-NEXT: v_fma_f32 v94, s4, v5, 
-v65 + ; GCN-NEXT: v_exp_f32_e32 v88, v88 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v68, v84, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v80 + ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v65 + ; GCN-NEXT: v_exp_f32_e32 v72, v2 + ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v82 + ; GCN-NEXT: v_pack_b32_f16 v4, v69, v4 + ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v81 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v84 + ; GCN-NEXT: ds_read_b128 v[0:3], v87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19 - ; GCN-NEXT: v_exp_f32_e32 v24, v24 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v73 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v65 + ; GCN-NEXT: v_exp_f32_e32 v73, v69 + ; GCN-NEXT: ds_read_b128 v[76:79], v87 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 - ; GCN-NEXT: v_exp_f32_e32 v26, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 - ; GCN-NEXT: v_exp_f32_e32 v67, v67 + ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v83 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v68, v85, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v90 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v65 + ; GCN-NEXT: v_exp_f32_e32 v71, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v88 + ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v65 + ; GCN-NEXT: v_exp_f32_e32 v81, v81 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47] ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 - ; GCN-NEXT: v_exp_f32_e32 v6, v6 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 + ; GCN-NEXT: v_add_f32_e32 v68, v86, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v72 + ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v65 + ; GCN-NEXT: v_exp_f32_e32 v74, v6 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v73 ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 - ; GCN-NEXT: v_exp_f32_e32 v7, v7 - ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_fma_f32 v75, s4, v11, -v65 + ; GCN-NEXT: v_exp_f32_e32 v83, v7 + ; GCN-NEXT: v_pack_b32_f16 v4, v70, v69 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v6 + ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v9 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v89, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v71 + ; GCN-NEXT: v_fma_f32 v70, s4, v12, -v65 + ; GCN-NEXT: v_exp_f32_e32 v84, v7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v81 + ; GCN-NEXT: v_fma_f32 v86, s4, v13, -v65 + ; GCN-NEXT: v_exp_f32_e32 v87, v8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v76, v92, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 - ; GCN-NEXT: v_exp_f32_e32 v10, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 - ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 - ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 - ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 - ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 - ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 - ; GCN-NEXT: v_exp_f32_e32 v8, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 - ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 - ; GCN-NEXT: v_exp_f32_e32 v9, v9 - ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 - ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 - ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 - ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 - ; GCN-NEXT: v_exp_f32_e32 v11, v11 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 - ; GCN-NEXT: v_exp_f32_e32 v12, v3 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 - ; GCN-NEXT: v_exp_f32_e32 v17, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 - ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 - ; GCN-NEXT: v_exp_f32_e32 v15, v3 - ; GCN-NEXT: v_exp_f32_e32 v18, v1 - ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v91 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v74 + ; GCN-NEXT: v_fma_f32 v77, s4, v14, -v65 + ; GCN-NEXT: v_exp_f32_e32 v89, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v92, v83 + ; GCN-NEXT: v_pack_b32_f16 v68, v68, v85 + ; GCN-NEXT: v_mul_f32_e32 v75, 0x3fb8aa3b, v75 + ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v70 + ; GCN-NEXT: v_pack_b32_f16 v69, v69, v92 + ; GCN-NEXT: v_fma_f32 v65, s4, v15, 
-v65 + ; GCN-NEXT: v_exp_f32_e32 v75, v75 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v76, v96, v76 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v84 + ; GCN-NEXT: v_exp_f32_e32 v92, v70 + ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v86 + ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v87 + ; GCN-NEXT: v_exp_f32_e32 v94, v70 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v67, v67, v76 + ; GCN-NEXT: v_add_f32_e32 v67, v91, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v93, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v80, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v82, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v90, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v88, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v72, v67 + ; GCN-NEXT: v_mul_f32_e32 v68, 0x3fb8aa3b, v77 + ; GCN-NEXT: v_add_f32_e32 v67, v73, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v89 + ; GCN-NEXT: v_exp_f32_e32 v78, v68 + ; GCN-NEXT: v_add_f32_e32 v67, v71, v67 + ; GCN-NEXT: ds_read_b128 v[68:71], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 - ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 - ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 - ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 - ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 - ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12 - ; GCN-NEXT: v_add_f32_e32 v6, v11, v6 - ; GCN-NEXT: v_add_f32_e32 v6, v12, v6 - ; GCN-NEXT: v_add_f32_e32 v1, v15, v6 - ; GCN-NEXT: v_add_f32_e32 v11, v17, v1 - ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v0, v4, v10 - ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576 + ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v75 + ; GCN-NEXT: v_exp_f32_e32 v65, v65 + ; GCN-NEXT: 
v_add_f32_e32 v67, v81, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v74, v67 + ; GCN-NEXT: v_pack_b32_f16 v77, v76, v77 + ; GCN-NEXT: v_pack_b32_f16 v76, v85, v86 + ; GCN-NEXT: v_add_f32_e32 v67, v83, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v72, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v78 + ; GCN-NEXT: v_cvt_f16_f32_e32 v74, v92 + ; GCN-NEXT: v_add_f32_e32 v67, v84, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v87, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v89, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v75, v67 + ; GCN-NEXT: v_pack_b32_f16 v69, v68, v72 + ; GCN-NEXT: v_pack_b32_f16 v68, v74, v73 + ; GCN-NEXT: ds_read_b128 v[72:75], v103 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v67, v92, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v94, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v78, v67 + ; GCN-NEXT: v_add_f32_e32 v65, v65, v67 + ; GCN-NEXT: ds_bpermute_b32 v67, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v65, v65, v67 + ; GCN-NEXT: ds_bpermute_b32 v66, v66, v65 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mov_b32_e32 v4, 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: v_mov_b32_e32 v67, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] - ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 + ; GCN-NEXT: v_cndmask_b32_e64 v65, v66, v65, s[0:1] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[48:63], v[70:71], v[68:69], v[48:63] + ; GCN-NEXT: v_fmac_f32_e32 v65, v67, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 22bc62acce15d..5bef205b3698e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX90A-VGPR: ; %bb.0: ; %bb ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 4 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v7, v5 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -647,10 +647,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX90A-VGPR: ; %bb.0: ; %bb ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -665,19 +665,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; 
GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 64 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: 
v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits: @@ -1326,26 +1326,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 64 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], 
v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0) @@ -1627,7 +1627,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 @@ -1645,8 +1645,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 
v0, v[8:11], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm: @@ -1655,7 +1655,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 @@ -1673,8 +1673,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) @@ -1741,7 +1741,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 @@ -1759,8 +1759,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; 
GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit: @@ -1769,7 +1769,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 @@ -1787,8 +1787,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index ab0000f6831b6..b35314b142ede 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -3187,13 +3187,9 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16 -; VGPRRC-NEXT: 
global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 @@ -3214,14 +3210,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8: @@ -3599,13 +3595,9 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 @@ -3626,14 
+3618,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags: @@ -4146,33 +4138,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; 
SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4256,33 +4247,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: 
v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -4320,33 +4310,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; VGPRRC-NEXT: s_nop 6 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: 
global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) @@ -4523,33 
+4512,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: 
v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4633,33 +4621,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; 
HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -4697,33 +4684,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: s_nop 6 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, 
s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir new file mode 100644 index 0000000000000..271b36fad2bb4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir @@ -0,0 +1,1292 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=GFX942_WITHOUT %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 
-amdgpu-avoid-hazard-hint-for-mfma=true %s -o - | FileCheck -check-prefix=GFX942_WITH %s + +--- | + target triple = "amdgcn-amd-amdhsa" + + define amdgpu_kernel void @test_software_pipelining() #0 { + bb.0: + ret void + } + + attributes #0 = {nounwind "amdgpu-waves-per-eu"="2" "amdgpu-agpr-alloc"="0" "frame-pointer"="none"} + +... +--- +name: test_software_pipelining +body: | + bb.0: + ; GFX942_WITHOUT-LABEL: name: test_software_pipelining + ; GFX942_WITHOUT: renamable $vgpr115 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr109 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr110 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr76 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr108 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable 
$vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr100 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr111 = V_ADD_U32_e32 4096, $vgpr100, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr101 = V_ADD_U32_e32 $vgpr76, killed $vgpr52, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr112 = V_ADD_U32_e32 4096, $vgpr101, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr112, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 renamable $vgpr108, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr80_vgpr81, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr76, killed $vgpr0, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr82_vgpr83, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr92_vgpr93, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr94_vgpr95, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr108, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr80_vgpr81, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr82_vgpr83, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr108, 10240, 0, 
implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr80_vgpr81, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr72_vgpr73_vgpr74_vgpr75, 16384, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr82_vgpr83, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr92_vgpr93, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr94_vgpr95, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr108, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr80_vgpr81, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 20480, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr82_vgpr83, killed 
$vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr108, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr92_vgpr93, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr94_vgpr95, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr110, 0, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: 
renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, killed $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr92_vgpr93, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, killed $vgpr94_vgpr95, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr120 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr104_vgpr105, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr106_vgpr107, killed $vgpr24_vgpr25_vgpr26_vgpr27, 
0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr104_vgpr105, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr114 = V_ADD_U32_e32 $vgpr115, 
killed $vgpr16, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr104_vgpr105, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr113 = V_ADD_U32_e32 $vgpr115, killed $vgpr20, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr106_vgpr107, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable 
$vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr102_vgpr103, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr104_vgpr105, killed $vgpr96_vgpr97_vgpr98_vgpr99, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr78_vgpr79, $vgpr106_vgpr107, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr78_vgpr79, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 28672, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr118_vgpr119_vgpr120_vgpr121 = DS_READ_B128_gfx9 killed renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr104_vgpr105, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr56 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr116 = V_ADD_U32_e32 $vgpr115, killed $vgpr56, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
$vgpr98_vgpr99, $vgpr106_vgpr107, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr100_vgpr101, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr72 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr115 = V_ADD_U32_e32 killed $vgpr115, killed $vgpr72, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr102_vgpr103, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr115, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279 + ; GFX942_WITHOUT-NEXT: S_BARRIER + ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 18432, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr104_vgpr105, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr120_vgpr121, killed $vgpr106_vgpr107, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr120_vgpr121, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 16384, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; 
GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: 
SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0 + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = DS_READ_B128_gfx9 renamable $vgpr108, 20480, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 22528, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr106_vgpr107, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 24576, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr104_vgpr105_vgpr106_vgpr107 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr112, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = 
DS_READ_B128_gfx9 renamable $vgpr108, 26624, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 28672, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr108, 30720, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr2_vgpr3, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 killed renamable $vgpr110, 16384, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr0_vgpr1, killed 
$vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr92 = IMPLICIT_DEF + ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = DS_READ_B128_gfx9 renamable $vgpr92, 18432, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr92, 20480, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 
0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 renamable $vgpr92, 22528, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr82_vgpr83, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr82_vgpr83, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 
implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr92, 24576, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr115, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, $vgpr10_vgpr11, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr92, 26624, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable 
$vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr92, 28672, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr109, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr10_vgpr11, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr12_vgpr13, killed 
$vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr14_vgpr15, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 killed renamable $vgpr92, 30720, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr8_vgpr9, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr10_vgpr11, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279 + ; GFX942_WITHOUT-NEXT: S_BARRIER + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr108, 2048, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, killed $vgpr10_vgpr11, killed 
$vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = DS_READ_B128_gfx9 killed renamable $vgpr108, 0, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITHOUT-NEXT: 
SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 
256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0 + ; GFX942_WITHOUT-NEXT: S_ENDPGM 0 + ; + ; GFX942_WITH-LABEL: name: test_software_pipelining + ; GFX942_WITH: renamable $vgpr96 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr121 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr122 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: 
renamable $vgpr120 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr123 = V_ADD_U32_e32 4096, $vgpr97, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr102 = V_ADD_U32_e32 $vgpr52, killed $vgpr0, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr124 = V_ADD_U32_e32 4096, $vgpr102, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr124, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) 
+ ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; 
GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr52, killed $vgpr0, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr6_vgpr7, $vgpr82_vgpr83, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr92_vgpr93, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr6_vgpr7, $vgpr94_vgpr95, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, 
$vgpr80_vgpr81, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 16384, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed 
$vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 20480, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: 
renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr122, 0, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr80_vgpr81, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, killed $vgpr82_vgpr83, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable 
$vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, killed $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr97, 2048, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr112_vgpr113, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr97, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr112_vgpr113, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + 
; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr112_vgpr113, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr126 = V_ADD_U32_e32 $vgpr96, killed $vgpr16, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr100_vgpr101, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable 
$vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr104_vgpr105_vgpr106_vgpr107, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr20 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr125 = V_ADD_U32_e32 $vgpr96, killed $vgpr20, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr116_vgpr117_vgpr118_vgpr119, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: 
(store (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr108_vgpr109_vgpr110_vgpr111, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr104_vgpr105_vgpr106_vgpr107, 28672, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit 
$exec + ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 killed renamable $vgpr97, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr56 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr104 = V_ADD_U32_e32 $vgpr96, killed $vgpr56, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr60 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr127 = V_ADD_U32_e32 killed $vgpr96, killed $vgpr60, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr127, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: S_WAITCNT 49279 + ; GFX942_WITH-NEXT: S_BARRIER + ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable 
$vgpr120, 18432, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr112_vgpr113, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, killed $vgpr114_vgpr115, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 16384, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 
+ ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: 
SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_BARRIER 0 + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed 
$vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 20480, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 22528, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = 
contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 24576, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr124, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 26624, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 28672, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 
renamable $vgpr121, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 30720, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, 
killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 killed renamable $vgpr122, 16384, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr105 = IMPLICIT_DEF + ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr105, 18432, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 renamable $vgpr105, 20480, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: dead 
renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr105, 22528, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr84_vgpr85_vgpr86_vgpr87 = 
BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr105, 24576, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr88_vgpr89_vgpr90_vgpr91 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr127, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr105, 26624, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr105, 28672, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr121, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3) + ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 killed renamable $vgpr105, 30720, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr8_vgpr9, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr12_vgpr13, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: S_WAITCNT 49279 + ; GFX942_WITH-NEXT: S_BARRIER + ; GFX942_WITH-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
$vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942_WITH-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = DS_READ_B128_gfx9 killed renamable $vgpr120, 0, 0, implicit $exec :: (load (s128), addrspace 3) + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; 
GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: 
SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 + ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 + ; GFX942_WITH-NEXT: SCHED_BARRIER 0 + ; GFX942_WITH-NEXT: S_ENDPGM 0 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = IMPLICIT_DEF + %5:sgpr_128 = IMPLICIT_DEF + %6:sgpr_128 = IMPLICIT_DEF + %7:vgpr_32 = IMPLICIT_DEF + %8:vreg_128_align2 = IMPLICIT_DEF + %9:vreg_128_align2 = IMPLICIT_DEF + %10:vreg_128_align2 = 
IMPLICIT_DEF + %11:vreg_128_align2 = IMPLICIT_DEF + %12:vreg_128_align2 = IMPLICIT_DEF + %13:vreg_128_align2 = IMPLICIT_DEF + %14:vreg_128_align2 = IMPLICIT_DEF + %15:vreg_128_align2 = IMPLICIT_DEF + %16:vreg_128_align2 = IMPLICIT_DEF + %17:vreg_128_align2 = IMPLICIT_DEF + %18:vreg_128_align2 = IMPLICIT_DEF + %19:vreg_128_align2 = IMPLICIT_DEF + %20:vreg_128_align2 = IMPLICIT_DEF + %21:vreg_128_align2 = IMPLICIT_DEF + %22:vreg_128_align2 = IMPLICIT_DEF + %23:vreg_128_align2 = IMPLICIT_DEF + %25:vgpr_32 = IMPLICIT_DEF + %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec + %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec + %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec + %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %31:vreg_128_align2 = IMPLICIT_DEF + %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %37:vreg_128_align2 = IMPLICIT_DEF + 
%36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %43:vgpr_32 = IMPLICIT_DEF + %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec + %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %45:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %42:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %18:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %46:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %48:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 8192, 0, implicit 
$exec :: (load (s128), addrspace 3) + %49:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %17:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %50:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %51:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %49:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %16:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %52:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %54:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + %55:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %15:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %56:vreg_128_align2 = IMPLICIT_DEF + DS_WRITE_B128_gfx9 %1:vgpr_32, %56:vreg_128_align2, 16384, 0, implicit $exec :: (store (s128), addrspace 3) + %57:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %55:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %14:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %58:vreg_128_align2, 0, 0, 0, implicit $mode, 
implicit $exec + %60:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + %61:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %13:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %62:vreg_128_align2 = IMPLICIT_DEF + DS_WRITE_B128_gfx9 %1:vgpr_32, %62:vreg_128_align2, 20480, 0, implicit $exec :: (store (s128), addrspace 3) + %63:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %61:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %12:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %64:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %66:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + %67:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %11:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %68:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %69:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %67:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %70:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %10:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %71:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
%60.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %70:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %72:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3) + %73:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %9:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %74:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %75:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %73:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %76:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %8:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %77:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %76:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %79:vgpr_32 = IMPLICIT_DEF + %78:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3) + %81:vreg_128_align2 = IMPLICIT_DEF + %80:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %33:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %82:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + %83:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %80:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %85:vreg_128_align2 = IMPLICIT_DEF + %84:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, 
%85.sub0_sub1:vreg_128_align2, %35:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %86:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %84:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %87:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %39:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %88:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + %89:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %87:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %90:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %41:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %91:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %90:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %92:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %45:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %94:vgpr_32 = IMPLICIT_DEF + %93:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %94:vgpr_32, implicit $exec + %95:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %96:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %92:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %97:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %47:vreg_128_align2, 0, 0, 0, implicit 
$mode, implicit $exec + %98:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %97:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %99:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + %100:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %51:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %102:vgpr_32 = IMPLICIT_DEF + %101:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %102:vgpr_32, implicit $exec + %103:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %104:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %100:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %105:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %53:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %106:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %105:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %107:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + %108:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %57:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %109:vreg_128_align2 = IMPLICIT_DEF + DS_WRITE_B128_gfx9 %1:vgpr_32, %109:vreg_128_align2, 24576, 0, implicit $exec :: (store (s128), addrspace 3) + %110:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %108:vreg_128_align2, 0, 0, 0, 
implicit $mode, implicit $exec + %111:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %59:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %112:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %111:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %113:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + %114:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %63:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %115:vreg_128_align2 = IMPLICIT_DEF + DS_WRITE_B128_gfx9 %1:vgpr_32, %115:vreg_128_align2, 28672, 0, implicit $exec :: (store (s128), addrspace 3) + %116:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %114:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %117:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %65:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %118:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %117:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %119:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + %120:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %69:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %122:vgpr_32 = IMPLICIT_DEF + %121:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %122:vgpr_32, implicit $exec + %123:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: 
(dereferenceable load (s128), align 1, addrspace 8) + %124:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %120:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %125:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %71:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %127:vgpr_32 = IMPLICIT_DEF + %126:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %127:vgpr_32, implicit $exec + %128:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %125:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %129:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + S_WAITCNT 49279 + S_BARRIER + %130:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3) + %131:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %75:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %132:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %131:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %133:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %77:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %134:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %133:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %135:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3) + SCHED_GROUP_BARRIER 8, 1, 0 + 
SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 512, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 512, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + 
SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 512, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 512, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_BARRIER 0 + %136:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %83:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %137:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3) + %138:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %136:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %139:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %86:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %140:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %139:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %141:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %89:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %142:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 22528, 0, implicit $exec :: (load 
(s128), addrspace 3) + %143:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %141:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %144:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %91:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %145:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %144:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %146:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %96:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %147:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %146:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %148:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %98:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %149:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %148:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %150:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3) + %151:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %104:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %152:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %151:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %153:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
%142.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %106:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %154:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %153:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %155:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3) + %156:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %110:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + DS_WRITE_B128_gfx9 %1:vgpr_32, %95:vreg_128_align2, 0, 0, implicit $exec :: (store (s128), addrspace 3) + %157:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %156:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %158:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %112:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %159:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %158:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %160:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3) + %161:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %116:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + DS_WRITE_B128_gfx9 %1:vgpr_32, %103:vreg_128_align2, 4096, 0, implicit $exec :: (store (s128), addrspace 3) + %162:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %161:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %163:vreg_128_align2 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %118:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %164:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %163:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %165:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3) + %166:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %124:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %981:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %167:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %166:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %168:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %128:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %169:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %168:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %170:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3) + %171:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %132:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %985:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %172:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
%165.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %171:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %173:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %134:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %174:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %173:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %176:vgpr_32 = IMPLICIT_DEF + %175:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3) + %177:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %138:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %178:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3) + %179:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %177:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %180:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %140:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %962:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %180:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %182:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %143:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %183:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3) + %961:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, 
%68.sub2_sub3:vreg_128_align2, %182:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %185:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %145:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %960:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %185:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %187:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %147:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %956:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %959:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %187:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %189:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %149:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %958:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %189:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %191:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3) + %192:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %152:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %962:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %957:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
%183.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %192:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %194:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %154:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %956:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %194:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %196:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3) + %197:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %157:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + DS_WRITE_B128_gfx9 %1:vgpr_32, %123:vreg_128_align2, 8192, 0, implicit $exec :: (store (s128), addrspace 3) + %955:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %197:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %199:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %159:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %954:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %199:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %201:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3) + %202:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %162:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + DS_WRITE_B128_gfx9 %1:vgpr_32, %129:vreg_128_align2, 12288, 0, implicit $exec :: (store (s128), addrspace 3) + %953:vreg_128_align2 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %202:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %204:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %164:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %952:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %204:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %206:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3) + %207:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %167:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %910:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %951:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %207:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %209:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %169:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %950:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %209:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %911:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + S_WAITCNT 49279 + S_BARRIER + %937:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3) + %211:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
%206.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %172:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %949:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %211:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %213:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %174:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %948:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %213:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %931:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3) + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 512, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 512, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + 
SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 512, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 512, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 32, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 8, 1, 0 + SCHED_GROUP_BARRIER 256, 1, 0 + SCHED_BARRIER 0 + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 7e30af96bb8b9..d9f1b542e4cb4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -1155,8 +1155,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX942-VGPR-LABEL: 
test_mfma_f32_4x4x4f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], 
s[8:9] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg @@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) ; ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1: ; GFX942-VGPR: ; %bb.0: -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1 +; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v4, 1 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3) @@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: 
s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> , i32 0, i32 0, i32 0) @@ -5542,6 +5542,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v1 @@ -5570,39 +5572,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[22:23] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[20:21] -; 
GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[18:19] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[16:17] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[14:15] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[12:13] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[10:11] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[8:9] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[62:63], v[30:31] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v64, 2.0 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[60:61], v[28:29] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[58:59], v[26:27] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[56:57], v[24:25] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[54:55], v[22:23] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[52:53], v[20:21] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[50:51], v[18:19] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[48:49], v[16:17] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[46:47], v[14:15] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[44:45], v[12:13] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[42:43], v[10:11] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[40:41], v[8:9] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[38:39], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[36:37], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[34:35], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[0:1] ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33] +; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80 -; GFX942-VGPR-NEXT: 
global_store_dwordx4 v1, v[18:21], s[0:1] offset:64 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[60:63], s[0:1] offset:112 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[56:59], s[0:1] offset:96 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[52:55], s[0:1] offset:80 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[48:51], s[0:1] offset:64 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[44:47], s[0:1] offset:48 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[40:43], s[0:1] offset:32 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[36:39], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[32:35], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) @@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat: ; GFX942-VGPR: ; %bb.0: ; %bb -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -5804,19 +5804,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: ; GFX942-VGPR: ; %bb.0: ; %bb -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index f0205a3a788ed..f4f1ca024b7d6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -5101,35 +5101,35 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 ; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; 
SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[38:39], 32 +; SDAG-NEXT: v_mov_b64_e32 v[40:41], 16 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[42:43], 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 
+; SDAG-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5137,6 +5137,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0 +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16 +; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] @@ -5154,28 +5157,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48 +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: 
global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5191,23 +5199,23 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-NEXT: v_mov_b32_e32 v32, 42 ; SDAG-NEXT: v_mov_b32_e32 v33, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_mov_b32_e32 v24, s20 -; SDAG-NEXT: v_mov_b32_e32 v25, s21 -; SDAG-NEXT: v_mov_b32_e32 v26, s22 -; SDAG-NEXT: v_mov_b32_e32 v27, s23 +; 
SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v28, s24 -; SDAG-NEXT: v_mov_b32_e32 v29, s25 -; SDAG-NEXT: v_mov_b32_e32 v30, s26 -; SDAG-NEXT: v_mov_b32_e32 v31, s27 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -5242,19 +5250,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: 
v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5265,6 +5287,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: v_mov_b32_e32 v32, 25 ; GISEL-NEXT: v_mov_b32_e32 v33, 42 ; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0 +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16 +; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] @@ -5296,20 +5321,20 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1 ; 
GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index 5475fa2ae5c6e..ef3bb0cb5f4f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd: @@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 
s5, 4.0 ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: s_nop 6 +; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 9a23788f8855a..a84b4803b04cc 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -373,7 +373,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mov_b32_e32 v22, 0x7fc00000 ; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] ; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 @@ -507,13 +507,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5] +; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5] ; CHECK-NEXT: s_nop 3 -; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0] ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; 
CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v9 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:7] ; CHECK-NEXT: ;;#ASMEND @@ -635,46 +635,14 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v18, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v20, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] 
offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; CHECK-NEXT: global_store_dwordx4 v32, a[24:27], s[2:3] offset:96 +; CHECK-NEXT: global_store_dwordx4 v32, a[28:31], s[2:3] offset:112 +; CHECK-NEXT: global_store_dwordx4 v32, a[16:19], s[2:3] offset:64 +; CHECK-NEXT: global_store_dwordx4 v32, a[20:23], s[2:3] offset:80 +; CHECK-NEXT: global_store_dwordx4 v32, a[8:11], s[2:3] offset:32 +; CHECK-NEXT: global_store_dwordx4 v32, a[12:15], s[2:3] offset:48 +; CHECK-NEXT: global_store_dwordx4 v32, a[0:3], s[2:3] +; CHECK-NEXT: global_store_dwordx4 v32, a[4:7], s[2:3] offset:16 ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) @@ -756,15 +724,18 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mov_b32_e32 v12, v31 ; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] -; CHECK-NEXT: v_and_b32_e32 v2, 0x3ff, v31 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3] +; CHECK-NEXT: v_and_b32_e32 v12, 0x3ff, v12 +; CHECK-NEXT: s_nop 2 ; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1] ; CHECK-NEXT: s_nop 8 ; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off +; CHECK-NEXT: v_lshlrev_b32_e32 v4, 3, v12 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5] +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: global_store_dwordx2 v[4:5], a[0:1], off ; CHECK-NEXT: 
s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %src2 = call double asm sideeffect "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll index a81d9a458e23a..e77856d073a0b 100644 --- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll @@ -311,43 +311,44 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[12:15] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112 +; CHECK-NEXT: global_store_dwordx4 v6, v[60:63], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: global_store_dwordx4 v6, v[56:59], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v6, v[52:55], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v6, v[48:51], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v6, v[44:47], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v6, v[40:43], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v6, v[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v6, v[32:35], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96 +; CHECK-NEXT: global_store_dwordx4 v6, a[56:59], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112 +; CHECK-NEXT: global_store_dwordx4 v6, a[60:63], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v6, a[48:51], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v6, a[52:55], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v6, a[40:43], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v6, a[44:47], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v6, a[32:35], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v6, a[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) From 7d3f0152d2f1434b2de811fbc0da5acfacacf2b1 Mon Sep 17 00:00:00 2001 From: mssefat Date: Fri, 19 Sep 2025 19:47:42 -0400 Subject: [PATCH 02/20] Rebase --- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 
6d2b10bdb5804..ed349fccfa3e4 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -48,7 +48,7 @@ static cl::opt EnableRegisterAvoidListForMFMARegs( "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden, cl::desc("Enable Register Avoidance for " "MFMA in GCNPreRAOptimizations stage."), - cl::init(true)); + cl::init(false)); namespace { From f8df624fc700256864eae2b2645232b38ff2d553 Mon Sep 17 00:00:00 2001 From: mssefat Date: Fri, 19 Sep 2025 20:05:14 -0400 Subject: [PATCH 03/20] rebase test files --- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 523 ++++++++--------- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 542 +++++++++--------- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 104 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 270 ++++----- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 146 ++--- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 159 +++-- .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll | 12 +- .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 69 ++- .../unspill-vgpr-after-rewrite-vgpr-mfma.ll | 33 +- 9 files changed, 924 insertions(+), 934 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index d4380fd41310a..b07dec326327e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -15,12 +15,9 @@ ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ; implicit-def: $vgpr106 ; GCN-NEXT: ; implicit-def: $vgpr132 - ; GCN-NEXT: ; implicit-def: $vgpr112 - ; GCN-NEXT: ; implicit-def: $vgpr113 - ; GCN-NEXT: ; implicit-def: $vgpr114 - ; GCN-NEXT: ; implicit-def: $vgpr115 ; GCN-NEXT: ; implicit-def: $vgpr133 ; GCN-NEXT: ; implicit-def: $vgpr139 + ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 ; 
GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -170,45 +167,46 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 + ; GCN-NEXT: ; implicit-def: $vgpr73 + ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: v_add_u32_e32 v73, v132, v112 ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v113 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 + ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v114 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr74 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 + ; GCN-NEXT: ; implicit-def: $vgpr75 ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v115 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt 
vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; kill: killed $vgpr73 ; GCN-NEXT: ds_read_b128 v[72:75], v94 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; kill: killed $vgpr76 ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ; implicit-def: $sgpr8 - ; GCN-NEXT: ; implicit-def: $vgpr112 - ; GCN-NEXT: ; implicit-def: $vgpr113 - ; GCN-NEXT: ; implicit-def: $vgpr114 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 @@ -413,6 +411,8 @@ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 ; GCN-NEXT: ; implicit-def: $vgpr65 ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: ; implicit-def: $vgpr68 + ; GCN-NEXT: ; implicit-def: $vgpr67 ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 @@ -440,36 +440,40 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: ; implicit-def: $vgpr96 + ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 ; GCN-NEXT: ; implicit-def: $vgpr65 ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v96 + ; GCN-NEXT: ; implicit-def: $vgpr64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v112 - ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 
v64, v132, v64 + ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v113 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v114 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 + ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 + ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_exp_f32_e32 v163, v57 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v165, v57 + ; GCN-NEXT: v_exp_f32_e32 v164, v57 ; GCN-NEXT: v_exp_f32_e32 v49, v48 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 @@ -495,30 +499,31 @@ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 ; GCN-NEXT: v_exp_f32_e32 v55, v48 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 - ; GCN-NEXT: v_exp_f32_e32 v56, v48 - ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 + ; GCN-NEXT: v_exp_f32_e32 v56, v48 + ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, 
-v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_fma_f32 v156, s4, v59, -v134 - ; GCN-NEXT: v_pack_b32_f16 v59, v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v58, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v80, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 + ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 + ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 + ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] @@ -527,15 +532,9 @@ ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 - ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 - ; GCN-NEXT: v_fma_f32 v157, s4, v60, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[58:59], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v141, v80 ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_fma_f32 v59, s4, 
v59, -v134 ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] -<<<<<<< HEAD ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 @@ -543,16 +542,10 @@ ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 -======= - ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] ->>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs) ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] -<<<<<<< HEAD ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 ; GCN-NEXT: v_exp_f32_e32 v58, v58 ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] @@ -563,17 +556,13 @@ ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 -======= - ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] ->>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs) ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], 
v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 -<<<<<<< HEAD ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 ; GCN-NEXT: v_exp_f32_e32 v59, v57 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] @@ -582,264 +571,249 @@ ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 -======= - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v144, v54 - ; GCN-NEXT: v_cvt_f16_f32_e32 v145, v55 - ; GCN-NEXT: v_exp_f32_e32 v167, v57 - ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 - ; GCN-NEXT: v_mul_f32_e32 v168, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] ->>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs) ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[58:59], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v148, v56 ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v149, v145, v148 - ; GCN-NEXT: v_pack_b32_f16 v148, v140, v144 - ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_exp_f32_e32 v168, v168 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v153, 
v140 - ; GCN-NEXT: ; implicit-def: $vgpr140 - ; GCN-NEXT: v_fma_f32 v164, s4, v61, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v62, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v169, v141 + ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 + ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: v_fma_f32 v57, s4, v35, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[148:149], v[64:79] - ; GCN-NEXT: ds_read_b128 v[142:145], v140 + ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 + ; GCN-NEXT: ; implicit-def: $vgpr57 + ; GCN-NEXT: ds_read_b128 v[60:63], v57 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[156:159], v140 offset:576 + ; GCN-NEXT: v_exp_f32_e32 v160, v149 + ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 + ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 + ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] + ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 + ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 + ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 + ; GCN-NEXT: v_exp_f32_e32 v162, v146 + ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[148:149], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v164 - ; GCN-NEXT: v_fma_f32 v164, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 
0x3fb8aa3b, v166 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v165 - ; GCN-NEXT: v_exp_f32_e32 v170, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v167 + ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[148:149], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v153 - ; GCN-NEXT: v_pack_b32_f16 v62, v169, v147 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_perm_b32 v147, v131, v129, s8 - ; GCN-NEXT: v_pack_b32_f16 v63, v146, v33 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[148:149], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v148, v33 - ; GCN-NEXT: v_fma_f32 v152, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v162, v160, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v149, v168 - ; GCN-NEXT: v_cvt_f16_f32_e32 v155, v170 - ; GCN-NEXT: v_perm_b32 v146, v163, v161, s8 ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[62:63], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v140 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[58:61], v140 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v164 - ; GCN-NEXT: v_exp_f32_e32 v154, v142 - ; GCN-NEXT: v_perm_b32 v142, v162, v160, s8 - ; GCN-NEXT: v_fma_f32 v160, s4, v38, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[156:157], v[62:63], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v157, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v148 - ; GCN-NEXT: v_fma_f32 v156, s4, v37, -v134 + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 + ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 + ; GCN-NEXT: v_mul_f32_e32 
v33, 0x3fb8aa3b, v152 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] + ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 + ; GCN-NEXT: v_exp_f32_e32 v153, v33 + ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 + ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 + ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 + ; GCN-NEXT: v_exp_f32_e32 v165, v60 + ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 + ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v161, v61 + ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v143, v130, v128, s8 + ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 + ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[62:63], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] + ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_perm_b32 v32, v163, v161, s5 - ; GCN-NEXT: v_exp_f32_e32 v161, v33 + ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 + ; GCN-NEXT: v_exp_f32_e32 v159, v33 ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] + ; 
GCN-NEXT: v_pack_b32_f16 v129, v150, v38 + ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 + ; GCN-NEXT: v_exp_f32_e32 v152, v38 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[142:143] + ; GCN-NEXT: ds_write_b64 v136, v[60:61] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v137, v[32:33] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[58:59], v[62:63], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v59, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v57 - ; GCN-NEXT: v_pack_b32_f16 v58, v149, v155 - ; GCN-NEXT: v_exp_f32_e32 v149, v38 ; GCN-NEXT: ; implicit-def: $vgpr33 ; GCN-NEXT: ; implicit-def: $vgpr38 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[146:147] + ; GCN-NEXT: ds_write_b64 v138, v[140:141] ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; implicit-def: $vgpr36 ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[144:145], v[58:59], v[64:79] ; GCN-NEXT: ; implicit-def: $vgpr37 ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: 
v_mul_f32_e32 v32, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v150, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 + ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[158:159], v[58:59], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v160 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v154 - ; GCN-NEXT: v_cvt_f16_f32_e32 v152, v157 - ; GCN-NEXT: v_fma_f32 v57, s4, v39, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 + ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 + ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 ; GCN-NEXT: ds_read_b128 v[36:39], v139 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[128:131], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[58:59], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v161 - ; GCN-NEXT: v_exp_f32_e32 v159, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v149 - ; GCN-NEXT: v_fma_f32 v155, s4, v41, -v134 - ; GCN-NEXT: v_fma_f32 v158, s4, v42, -v134 - ; GCN-NEXT: v_fma_f32 v162, s4, v20, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[60:61], v[58:59], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v59, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v57 - ; GCN-NEXT: v_pack_b32_f16 v58, v33, v152 - ; GCN-NEXT: v_exp_f32_e32 v60, v32 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v154, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 + ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 + ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[80:95], v[142:143], v[128:129], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v155, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 + ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 + ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 + ; GCN-NEXT: v_exp_f32_e32 v157, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 + ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 + ; GCN-NEXT: v_exp_f32_e32 v146, v32 ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v43, -v134 - ; GCN-NEXT: v_perm_b32 v20, v142, v62, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[58:59], v[64:79] + ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 + ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v61, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v155 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v150 - ; GCN-NEXT: v_fma_f32 v155, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[128:129], v[58:59], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v152, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v128, v156 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v57 - ; GCN-NEXT: v_fma_f32 v129, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[58:59], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v159 - ; GCN-NEXT: v_exp_f32_e32 v158, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 + ; GCN-NEXT: v_exp_f32_e32 
v147, v36 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v143, v36 + ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 + ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 + ; GCN-NEXT: v_exp_f32_e32 v156, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[58:59], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v57, v36 + ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v129, v36 ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v61 - ; GCN-NEXT: v_fma_f32 v58, s4, v47, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 + ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v140 + ; GCN-NEXT: ds_read_b128 v[36:39], v57 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v128, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v129 - ; GCN-NEXT: v_cvt_f16_f32_e32 v129, v152 - ; GCN-NEXT: ds_read_b128 v[44:47], v140 offset:576 + ; GCN-NEXT: v_exp_f32_e32 v142, v40 + ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 + ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 + ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[130:131], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v130, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 
0x3fb8aa3b, v155 - ; GCN-NEXT: v_exp_f32_e32 v131, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v59, v129 - ; GCN-NEXT: v_fma_f32 v155, s4, v18, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] + ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 + ; GCN-NEXT: v_exp_f32_e32 v63, v40 + ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 + ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 + ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v158 - ; GCN-NEXT: v_exp_f32_e32 v160, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v57 + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 + ; GCN-NEXT: v_exp_f32_e32 v158, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v58 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] - ; GCN-NEXT: v_fma_f32 v58, s4, v19, -v134 - ; GCN-NEXT: v_exp_f32_e32 v129, v17 - ; GCN-NEXT: v_perm_b32 v42, v143, v63, s8 - ; GCN-NEXT: v_perm_b32 v43, v147, v145, s8 + ; GCN-NEXT: v_exp_f32_e32 v128, v17 + ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 + ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v140 offset:1152 + ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v140 offset:1728 + ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v130 - ; GCN-NEXT: v_exp_f32_e32 v163, v36 - ; GCN-NEXT: v_perm_b32 v36, v142, v62, s8 + 
; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_exp_f32_e32 v167, v36 + ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v160 - ; GCN-NEXT: v_perm_b32 v21, v146, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v146, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 + ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 + ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b64 v135, v[20:21] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v143, v63, s5 - ; GCN-NEXT: v_fma_f32 v63, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v129 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v155 - ; GCN-NEXT: v_exp_f32_e32 v142, v17 - ; GCN-NEXT: v_perm_b32 v17, v147, v145, s5 + ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 + ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 + ; GCN-NEXT: v_exp_f32_e32 v140, v17 + ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v136, v[36:37] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v58 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 ; GCN-NEXT: v_exp_f32_e32 v144, v22 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -862,22 +836,22 @@ ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v59, v44 + ; GCN-NEXT: 
v_pack_b32_f16 v32, v61, v44 ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v162 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] ; GCN-NEXT: v_exp_f32_e32 v132, v16 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v163 - ; GCN-NEXT: v_fma_f32 v143, s4, v23, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 + ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 ; GCN-NEXT: ds_read_b128 v[20:23], v139 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -886,20 +860,20 @@ ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v63 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130 ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v63, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v147, s4, v4, -v134 + ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 + ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 ; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v142 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 ; GCN-NEXT: v_exp_f32_e32 v145, v16 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v143 + ; GCN-NEXT: 
v_mul_f32_e32 v16, 0x3fb8aa3b, v141 ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 ; GCN-NEXT: v_exp_f32_e32 v35, v16 ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 @@ -921,11 +895,11 @@ ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v63 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v143, v16 + ; GCN-NEXT: v_exp_f32_e32 v141, v16 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v63, s4, v30, -v134 + ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] @@ -933,25 +907,25 @@ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v140 + ; GCN-NEXT: ds_read_b128 v[20:23], v57 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_exp_f32_e32 v36, v24 ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v140 offset:576 + ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v63 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 ; GCN-NEXT: v_exp_f32_e32 v39, v24 ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v63, s4, v2, -v134 + ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v143 
- ; GCN-NEXT: v_exp_f32_e32 v146, v1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 + ; GCN-NEXT: v_exp_f32_e32 v148, v1 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 @@ -959,25 +933,25 @@ ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 ; GCN-NEXT: v_exp_f32_e32 v34, v1 ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v59, v45, s8 + ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1152 + ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v140 offset:1728 + ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v155, v20 + ; GCN-NEXT: v_exp_f32_e32 v150, v20 ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v146 + ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v38, v21 ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v58, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v58, v44, s8 + ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 + ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND @@ -987,9 +961,9 @@ ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v63 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 ; GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v59, v45, s5 + ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v136, v[20:21] @@ -1013,10 +987,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v147 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 ; GCN-NEXT: v_exp_f32_e32 v26, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v155 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,10 +1042,10 @@ ; GCN-NEXT: v_exp_f32_e32 v21, v9 ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v140 + ; GCN-NEXT: ds_read_b128 v[4:7], v57 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[12:15], v140 offset:576 + ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 @@ -1097,33 +1071,33 @@ ; GCN-NEXT: v_add_f32_e32 v3, v54, v3 ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v168, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v170, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 + ; GCN-NEXT: v_add_f32_e32 v3, 
v165, v3 ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v149, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v60, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v61, v3 ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v57, v3 ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v131, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 @@ -1131,14 +1105,14 @@ ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 ; GCN-NEXT: 
v_add_f32_e32 v3, v38, v3 @@ -1163,18 +1137,17 @@ ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1152 + ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2 - ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111] - ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v2, s[6:7] - ; GCN-NEXT: v_fmac_f32_e32 v1, v0, v48 - ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1728 + ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] + ; GCN-NEXT: ; implicit-def: $vgpr4 + ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 + ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index be97a1e82fcf2..0887fdf0844b0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -10,24 +10,25 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s20, v2 ; GCN-NEXT: ; implicit-def: $sgpr4 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr50 ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: ; implicit-def: $vgpr49 ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 - ; GCN-NEXT: ; implicit-def: $vgpr50 + ; GCN-NEXT: ; implicit-def: $vgpr51 + ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 + ; GCN-NEXT: ; 
implicit-def: $vgpr76 ; GCN-NEXT: ; implicit-def: $vgpr77 ; GCN-NEXT: ; implicit-def: $vgpr78 ; GCN-NEXT: ; implicit-def: $vgpr79 ; GCN-NEXT: ; implicit-def: $vgpr80 - ; GCN-NEXT: ; implicit-def: $vgpr81 - ; GCN-NEXT: ; implicit-def: $vgpr103 + ; GCN-NEXT: ; implicit-def: $vgpr91 ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v64 + ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -35,9 +36,8 @@ ; GCN-NEXT: s_lshl_b32 s4, s20, 7 ; GCN-NEXT: ; implicit-def: $vgpr5 ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1 - ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 - ; GCN-NEXT: v_add_u32_e32 v77, s20, v77 - ; GCN-NEXT: v_and_b32_e32 v77, 0x1fffffff, v77 + ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 + ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b128 v48, v[0:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -48,8 +48,8 @@ ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_add_u32_e32 v0, v0, v76 - ; GCN-NEXT: v_add_u32_e32 v1, v1, v76 + ; GCN-NEXT: v_add_u32_e32 v0, v0, v50 + ; GCN-NEXT: v_add_u32_e32 v1, v1, v50 ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -68,22 +68,22 @@ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 - ; GCN-NEXT: v_mul_lo_u32 v77, v77, s6 - ; GCN-NEXT: v_add_lshl_u32 v77, v78, v77, 1 - ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 + ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6 + ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 
1 + ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76 ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 + ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 ; GCN-NEXT: ; implicit-def: $sgpr2 ; GCN-NEXT: ; implicit-def: $sgpr3 - ; GCN-NEXT: v_lshl_add_u32 v80, v81, 1, v79 + ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ds_read_b128 v[36:39], v50 + ; GCN-NEXT: ds_read_b128 v[36:39], v51 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[44:47], v50 offset:512 + ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 @@ -107,20 +107,20 @@ ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v50 + ; GCN-NEXT: ds_read_b128 v[68:71], v51 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] ; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: v_add_u32_e32 v83, v32, v76 - ; GCN-NEXT: v_add_u32_e32 v76, v33, v76 + ; GCN-NEXT: v_add_u32_e32 v82, v32, v50 + ; GCN-NEXT: v_add_u32_e32 v83, v33, v50 + ; GCN-NEXT: ; kill: killed $vgpr82 ; GCN-NEXT: ; kill: killed $vgpr83 - ; GCN-NEXT: ; kill: killed $vgpr76 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31] - ; GCN-NEXT: ds_read_b128 v[66:69], v50 offset:512 + ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART @@ -131,20 +131,20 @@ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15] ; GCN-NEXT: ; implicit-def: $vgpr66 ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_max_f32_e32 v82, v67, v67 + ; GCN-NEXT: v_max_f32_e32 v81, v67, v67 ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31] ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2 ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v77, v70 + ; GCN-NEXT: ds_write_b32 v76, v70 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v71 + ; GCN-NEXT: ds_write_b32 v77, v71 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v72 + ; GCN-NEXT: ds_write_b32 v78, v72 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16 @@ -152,11 +152,11 @@ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v21 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21 ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v22 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v81 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25 ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 @@ -166,12 +166,12 @@ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29 ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v30 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30 ; GCN-NEXT: 
v_mul_f32_e32 v84, s4, v31 ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v81, v84 + ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2 ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3 ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86 @@ -179,315 +179,315 @@ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5 ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v7 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7 ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v81 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10 ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11 ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12 ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13 ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65 - ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 - ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 + ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 + ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 + ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 + ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v80, v68 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 + ; GCN-NEXT: ds_write_b32 v79, v68 + ; GCN-NEXT: ; implicit-def: $vgpr84 + ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 + ; GCN-NEXT: v_max_f32_e32 v70, v64, v65 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: 
buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 + ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ; implicit-def: $vgpr87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 - ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[0:1] - ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 - ; GCN-NEXT: v_max_f32_e32 v65, v82, v64 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v65 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v65 - ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v65 - ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v65 + ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] + ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 + ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 + ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 + ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 + ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v65 - ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v65 - ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v65 - ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v65 - ; GCN-NEXT: v_exp_f32_e32 v72, v16 - ; GCN-NEXT: v_exp_f32_e32 v73, v17 - ; GCN-NEXT: v_exp_f32_e32 v81, v18 - ; GCN-NEXT: v_exp_f32_e32 v82, v19 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 + ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 + ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 + ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 + ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 + ; GCN-NEXT: v_exp_f32_e32 v73, v16 + ; GCN-NEXT: v_exp_f32_e32 v74, v18 + ; GCN-NEXT: v_exp_f32_e32 v75, v19 ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 ; GCN-NEXT: 
v_mul_f32_e32 v21, 0x3fb8aa3b, v21 ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v72 - ; GCN-NEXT: v_fma_f32 v17, s4, v24, -v65 - ; GCN-NEXT: v_exp_f32_e32 v83, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v73 - ; GCN-NEXT: v_fma_f32 v19, s4, v25, -v65 - ; GCN-NEXT: v_exp_f32_e32 v84, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v81 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v65 - ; GCN-NEXT: v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v82 - ; GCN-NEXT: v_pack_b32_f16 v24, v16, v18 - ; GCN-NEXT: v_sub_f32_e32 v22, v67, v65 + ; GCN-NEXT: v_exp_f32_e32 v80, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 + ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 + ; GCN-NEXT: v_exp_f32_e32 v81, v21 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 + ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 + ; GCN-NEXT: v_exp_f32_e32 v82, v22 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_pack_b32_f16 v25, v20, v21 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v17 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v19 - ; GCN-NEXT: ds_read_b128 v[16:19], v87 + ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 + ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 + ; GCN-NEXT: v_exp_f32_e32 v83, v23 + ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 + ; GCN-NEXT: v_exp_f32_e32 v85, v22 + ; GCN-NEXT: v_exp_f32_e32 v17, v17 + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 + ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 + ; GCN-NEXT: v_exp_f32_e32 v88, v23 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 + ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 + ; GCN-NEXT: ds_read_b128 v[18:21], v84 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v65 
- ; GCN-NEXT: v_exp_f32_e32 v86, v23 - ; GCN-NEXT: v_exp_f32_e32 v64, v22 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v16, 0, v72 - ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v83 - ; GCN-NEXT: v_fma_f32 v88, s4, v28, -v65 - ; GCN-NEXT: v_exp_f32_e32 v89, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v90, v84 - ; GCN-NEXT: v_fma_f32 v91, s4, v29, -v65 - ; GCN-NEXT: v_exp_f32_e32 v92, v21 - ; GCN-NEXT: ds_read_b128 v[20:23], v87 offset:576 + ; GCN-NEXT: v_exp_f32_e32 v16, v24 + ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0] - ; GCN-NEXT: v_perm_b32 v99, v70, v68, s2 - ; GCN-NEXT: v_perm_b32 v100, v70, v68, s3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], 
v[32:47] - ; GCN-NEXT: v_add_f32_e32 v93, v73, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v94, v85 - ; GCN-NEXT: v_fma_f32 v95, s4, v30, -v65 - ; GCN-NEXT: v_exp_f32_e32 v96, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v97, v86 - ; GCN-NEXT: v_fma_f32 v98, s4, v31, -v65 - ; GCN-NEXT: v_perm_b32 v101, v71, v69, s2 - ; GCN-NEXT: v_perm_b32 v102, v71, v69, s3 - ; GCN-NEXT: ds_read_b128 v[68:71], v103 + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 + ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 + ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 + ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] + ; 
GCN-NEXT: v_add_f32_e32 v17, v17, v18 + ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 + ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 + ; GCN-NEXT: v_exp_f32_e32 v30, v18 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 + ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 + ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 + ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 + ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 + ; GCN-NEXT: ds_read_b128 v[26:29], v91 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[72:75], v103 offset:576 + ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v77, v99 - ; GCN-NEXT: v_exp_f32_e32 v67, v67 - ; GCN-NEXT: v_pack_b32_f16 v76, v76, v90 - ; GCN-NEXT: v_pack_b32_f16 v77, v94, v97 + ; GCN-NEXT: ds_write_b32 v76, v31 + ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_exp_f32_e32 v31, v31 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 + ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v100 + ; GCN-NEXT: ds_write_b32 v77, v64 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v101 - ; GCN-NEXT: v_mul_f32_e32 v78, 0x3fb8aa3b, v88 - ; GCN-NEXT: v_mul_f32_e32 v79, 0x3fb8aa3b, v91 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v81, v81, v93 - ; GCN-NEXT: v_cvt_f16_f32_e32 v90, v89 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v65 - ; GCN-NEXT: v_exp_f32_e32 v91, v78 - ; GCN-NEXT: v_cvt_f16_f32_e32 v78, v92 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v65 - ; GCN-NEXT: v_exp_f32_e32 v93, v79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47] + ; 
GCN-NEXT: ds_write_b32 v78, v90 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v80, v102 - ; GCN-NEXT: v_mul_f32_e32 v80, 0x3fb8aa3b, v95 - ; GCN-NEXT: v_add_f32_e32 v76, v82, v81 - ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v96 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v65 - ; GCN-NEXT: v_exp_f32_e32 v80, v80 - ; GCN-NEXT: v_cvt_f16_f32_e32 v79, v67 - ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v98 - ; GCN-NEXT: v_fma_f32 v81, s4, v3, -v65 - ; GCN-NEXT: v_exp_f32_e32 v82, v88 + ; GCN-NEXT: ds_write_b32 v79, v65 + ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 + ; GCN-NEXT: v_exp_f32_e32 v22, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 + ; GCN-NEXT: v_exp_f32_e32 v64, v65 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 + ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 + ; GCN-NEXT: v_exp_f32_e32 v23, v23 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 - ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v90, v78 - ; GCN-NEXT: v_pack_b32_f16 v1, v77, v79 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 + ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19 + ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 + ; GCN-NEXT: v_exp_f32_e32 v25, v67 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 + ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 + ; GCN-NEXT: v_exp_f32_e32 v27, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64 + ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 + ; 
GCN-NEXT: v_exp_f32_e32 v65, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 + ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 + ; GCN-NEXT: v_exp_f32_e32 v68, v2 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: s_nop 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v68, v83, v76 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v91 - ; GCN-NEXT: v_fma_f32 v83, s4, v4, -v65 - ; GCN-NEXT: v_exp_f32_e32 v90, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v93 - ; GCN-NEXT: v_fma_f32 v94, s4, v5, -v65 - ; GCN-NEXT: v_exp_f32_e32 v88, v88 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v68, v84, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v80 - ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v65 - ; GCN-NEXT: v_exp_f32_e32 v72, v2 - ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v82 - ; GCN-NEXT: v_pack_b32_f16 v4, v69, v4 - ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v81 + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v87 + ; GCN-NEXT: ds_read_b128 v[0:3], v84 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v73 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v65 - ; GCN-NEXT: v_exp_f32_e32 v73, v69 - ; GCN-NEXT: ds_read_b128 v[76:79], v87 offset:576 + ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19 + ; GCN-NEXT: v_exp_f32_e32 v24, v24 + ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v83 - ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v94 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], 
v[48:63] - ; GCN-NEXT: v_add_f32_e32 v68, v85, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v90 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v65 - ; GCN-NEXT: v_exp_f32_e32 v71, v69 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v88 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v65 - ; GCN-NEXT: v_exp_f32_e32 v81, v81 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 + ; GCN-NEXT: v_exp_f32_e32 v26, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 + ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 + ; GCN-NEXT: v_exp_f32_e32 v67, v67 ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 - ; GCN-NEXT: v_add_f32_e32 v68, v86, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v72 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v65 - ; GCN-NEXT: v_exp_f32_e32 v74, v6 - ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v73 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 + ; GCN-NEXT: v_exp_f32_e32 v6, v6 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 - ; GCN-NEXT: v_fma_f32 v75, s4, v11, -v65 - ; GCN-NEXT: v_exp_f32_e32 v83, v7 - ; GCN-NEXT: v_pack_b32_f16 v4, v70, v69 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v6 - ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v9 + ; GCN-NEXT: v_exp_f32_e32 v7, v7 + ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v0, v89, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v71 - ; GCN-NEXT: v_fma_f32 v70, s4, v12, -v65 - ; GCN-NEXT: v_exp_f32_e32 v84, v7 - ; GCN-NEXT: v_cvt_f16_f32_e32 v85, 
v81 - ; GCN-NEXT: v_fma_f32 v86, s4, v13, -v65 - ; GCN-NEXT: v_exp_f32_e32 v87, v8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v76, v92, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v74 - ; GCN-NEXT: v_fma_f32 v77, s4, v14, -v65 - ; GCN-NEXT: v_exp_f32_e32 v89, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v92, v83 - ; GCN-NEXT: v_pack_b32_f16 v68, v68, v85 - ; GCN-NEXT: v_mul_f32_e32 v75, 0x3fb8aa3b, v75 - ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v70 - ; GCN-NEXT: v_pack_b32_f16 v69, v69, v92 - ; GCN-NEXT: v_fma_f32 v65, s4, v15, -v65 - ; GCN-NEXT: v_exp_f32_e32 v75, v75 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v76, v96, v76 - ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v84 - ; GCN-NEXT: v_exp_f32_e32 v92, v70 - ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v86 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v87 - ; GCN-NEXT: v_exp_f32_e32 v94, v70 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v67, v67, v76 - ; GCN-NEXT: v_add_f32_e32 v67, v91, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v93, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v80, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v82, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v90, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v88, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v72, v67 - ; GCN-NEXT: v_mul_f32_e32 v68, 0x3fb8aa3b, v77 - ; GCN-NEXT: v_add_f32_e32 v67, v73, v67 - ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v89 - ; GCN-NEXT: v_exp_f32_e32 v78, v68 - ; GCN-NEXT: v_add_f32_e32 v67, v71, v67 - ; GCN-NEXT: ds_read_b128 v[68:71], v103 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 + ; GCN-NEXT: 
v_exp_f32_e32 v10, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 + ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 + ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 + ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 + ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 + ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 + ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 + ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 + ; GCN-NEXT: v_exp_f32_e32 v8, v8 + ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 + ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 + ; GCN-NEXT: v_exp_f32_e32 v9, v9 + ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 + ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 + ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 + ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 + ; GCN-NEXT: v_exp_f32_e32 v11, v11 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 + ; GCN-NEXT: v_exp_f32_e32 v12, v3 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 + ; GCN-NEXT: v_exp_f32_e32 v17, v1 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 + ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 + ; GCN-NEXT: v_exp_f32_e32 v15, v3 + ; GCN-NEXT: v_exp_f32_e32 v18, v1 + ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 + ; GCN-NEXT: ds_read_b128 v[0:3], v91 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: 
v_cvt_f16_f32_e32 v77, v75 - ; GCN-NEXT: v_exp_f32_e32 v65, v65 - ; GCN-NEXT: v_add_f32_e32 v67, v81, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v74, v67 - ; GCN-NEXT: v_pack_b32_f16 v77, v76, v77 - ; GCN-NEXT: v_pack_b32_f16 v76, v85, v86 - ; GCN-NEXT: v_add_f32_e32 v67, v83, v67 - ; GCN-NEXT: v_cvt_f16_f32_e32 v72, v65 - ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v94 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v78 - ; GCN-NEXT: v_cvt_f16_f32_e32 v74, v92 - ; GCN-NEXT: v_add_f32_e32 v67, v84, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v87, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v89, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v75, v67 - ; GCN-NEXT: v_pack_b32_f16 v69, v68, v72 - ; GCN-NEXT: v_pack_b32_f16 v68, v74, v73 - ; GCN-NEXT: ds_read_b128 v[72:75], v103 offset:576 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 + ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 + ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 + ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 + ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 + ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 + ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 + ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12 + ; GCN-NEXT: v_add_f32_e32 v6, v11, v6 + ; GCN-NEXT: v_add_f32_e32 v6, v12, v6 + ; GCN-NEXT: v_add_f32_e32 v1, v15, v6 + ; GCN-NEXT: v_add_f32_e32 v11, v17, v1 + ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7 + ; GCN-NEXT: v_pack_b32_f16 v0, v4, v10 + ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_f32_e32 v67, v92, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v94, v67 - ; GCN-NEXT: v_add_f32_e32 v67, v78, v67 - ; GCN-NEXT: v_add_f32_e32 v65, v65, v67 - ; GCN-NEXT: ds_bpermute_b32 v67, v66, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: 
v_add_f32_e32 v65, v65, v67 - ; GCN-NEXT: ds_bpermute_b32 v66, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mov_b32_e32 v67, 0 + ; GCN-NEXT: v_mov_b32_e32 v4, 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 + ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 + ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v65, v66, v65, s[0:1] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63] - ; GCN-NEXT: v_fmac_f32_e32 v65, v67, v64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47] + ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] + ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 5bef205b3698e..ed3d1399e5926 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX90A-VGPR: ; %bb.0: ; %bb ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 4 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -647,10 +647,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX90A-VGPR: ; %bb.0: ; %bb ; GFX90A-VGPR-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x24 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v9 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -665,19 +665,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v9 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 -; GFX942-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 64 -; 
GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits: @@ -1326,26 +1326,26 @@ define amdgpu_kernel void 
@test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 64 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[2:3] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0) @@ -1645,8 +1645,8 @@ define 
amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm: @@ -1673,8 +1673,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) @@ -1759,8 +1759,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit: @@ -1787,8 +1787,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; 
GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index b35314b142ede..ab0000f6831b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -3187,9 +3187,13 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16 +; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0 +; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 @@ -3210,14 +3214,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; 
VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8: @@ -3595,9 +3599,13 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16 +; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0 +; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 @@ -3618,14 +3626,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags: @@ -4138,32 +4146,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] 
; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s16 -; SDAG-NEXT: v_mov_b32_e32 v33, s17 -; SDAG-NEXT: v_mov_b32_e32 v34, s18 -; SDAG-NEXT: v_mov_b32_e32 v35, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s12 -; SDAG-NEXT: v_mov_b32_e32 v33, s13 -; SDAG-NEXT: v_mov_b32_e32 v34, s14 -; SDAG-NEXT: v_mov_b32_e32 v35, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s8 -; SDAG-NEXT: v_mov_b32_e32 v33, s9 -; SDAG-NEXT: v_mov_b32_e32 v34, s10 -; SDAG-NEXT: v_mov_b32_e32 v35, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: 
global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4247,32 +4256,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_nop 6 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v32, s16 -; HEURRC-NEXT: v_mov_b32_e32 v33, s17 -; HEURRC-NEXT: v_mov_b32_e32 v34, s18 -; HEURRC-NEXT: v_mov_b32_e32 v35, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v32, s12 -; HEURRC-NEXT: v_mov_b32_e32 v33, s13 -; HEURRC-NEXT: v_mov_b32_e32 v34, s14 -; HEURRC-NEXT: v_mov_b32_e32 v35, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt 
vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v32, s8 -; HEURRC-NEXT: v_mov_b32_e32 v33, s9 -; HEURRC-NEXT: v_mov_b32_e32 v34, s10 -; HEURRC-NEXT: v_mov_b32_e32 v35, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -4310,32 +4320,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_nop 6 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 -; VGPRRC-NEXT: 
v_mov_b32_e32 v33, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) @@ -4512,32 +4523,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s16 -; SDAG-NEXT: v_mov_b32_e32 v33, s17 -; SDAG-NEXT: v_mov_b32_e32 v34, s18 -; SDAG-NEXT: v_mov_b32_e32 v35, s19 -; 
SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s12 -; SDAG-NEXT: v_mov_b32_e32 v33, s13 -; SDAG-NEXT: v_mov_b32_e32 v34, s14 -; SDAG-NEXT: v_mov_b32_e32 v35, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s8 -; SDAG-NEXT: v_mov_b32_e32 v33, s9 -; SDAG-NEXT: v_mov_b32_e32 v34, s10 -; SDAG-NEXT: v_mov_b32_e32 v35, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4621,32 +4633,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_nop 6 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: 
v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v32, s16 -; HEURRC-NEXT: v_mov_b32_e32 v33, s17 -; HEURRC-NEXT: v_mov_b32_e32 v34, s18 -; HEURRC-NEXT: v_mov_b32_e32 v35, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v32, s12 -; HEURRC-NEXT: v_mov_b32_e32 v33, s13 -; HEURRC-NEXT: v_mov_b32_e32 v34, s14 -; HEURRC-NEXT: v_mov_b32_e32 v35, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v32, s8 -; HEURRC-NEXT: v_mov_b32_e32 v33, s9 -; HEURRC-NEXT: v_mov_b32_e32 v34, s10 -; HEURRC-NEXT: v_mov_b32_e32 v35, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -4684,32 +4697,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC-NEXT: v_mov_b64_e32 
v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_nop 6 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 +; 
VGPRRC-NEXT: v_mov_b32_e32 v17, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index d9f1b542e4cb4..7e30af96bb8b9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -1155,8 +1155,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s6 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s7 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; 
GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s6 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s7 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 -; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: 
%in.1 = load <4 x i32>, ptr addrspace(1) %arg @@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) ; ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1: ; GFX942-VGPR: ; %bb.0: -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3) @@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_nop 1 -; 
GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> , i32 0, i32 0, i32 0) @@ -5542,8 +5542,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v1 @@ -5572,37 +5570,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[62:63], v[30:31] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v64, 2.0 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[60:61], v[28:29] -; 
GFX942-VGPR-NEXT: v_mov_b64_e32 v[58:59], v[26:27] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[56:57], v[24:25] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[54:55], v[22:23] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[52:53], v[20:21] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[50:51], v[18:19] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[48:49], v[16:17] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[46:47], v[14:15] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[44:45], v[12:13] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[42:43], v[10:11] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[40:41], v[8:9] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[38:39], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[36:37], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[34:35], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[22:23] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[20:21] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[18:19] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[16:17] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[14:15] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[12:13] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[10:11] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[8:9] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63] +; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[60:63], s[0:1] offset:112 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[56:59], s[0:1] offset:96 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[52:55], s[0:1] offset:80 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[48:51], s[0:1] offset:64 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[44:47], s[0:1] offset:48 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[40:43], s[0:1] offset:32 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[36:39], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[32:35], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) @@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat: ; GFX942-VGPR: ; %bb.0: ; %bb -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v8, 4, v0 +; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -5804,19 +5804,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: ; GFX942-VGPR: ; %bb.0: ; %bb -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index f4f1ca024b7d6..f0205a3a788ed 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -5101,35 +5101,35 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 ; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[38:39], 32 -; SDAG-NEXT: v_mov_b64_e32 v[40:41], 16 -; SDAG-NEXT: v_mov_b32_e32 v32, s16 -; SDAG-NEXT: v_mov_b32_e32 v33, s17 -; SDAG-NEXT: v_mov_b32_e32 v34, s18 -; SDAG-NEXT: v_mov_b32_e32 v35, s19 -; SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[42:43], 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s12 -; SDAG-NEXT: v_mov_b32_e32 v33, s13 -; SDAG-NEXT: v_mov_b32_e32 v34, s14 -; SDAG-NEXT: v_mov_b32_e32 v35, s15 -; SDAG-NEXT: global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s8 -; SDAG-NEXT: v_mov_b32_e32 v33, s9 -; SDAG-NEXT: v_mov_b32_e32 v34, s10 -; SDAG-NEXT: v_mov_b32_e32 v35, s11 -; SDAG-NEXT: global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 
v[38:39], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5137,9 +5137,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0 -; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16 -; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] @@ -5157,33 +5154,28 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48 -; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1 +; 
GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5199,23 +5191,23 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-NEXT: v_mov_b32_e32 v32, 42 ; SDAG-NEXT: v_mov_b32_e32 v33, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, 
s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v24, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s21 +; SDAG-NEXT: v_mov_b32_e32 v26, s22 +; SDAG-NEXT: v_mov_b32_e32 v27, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v28, s24 +; SDAG-NEXT: v_mov_b32_e32 v29, s25 +; SDAG-NEXT: v_mov_b32_e32 v30, s26 +; SDAG-NEXT: v_mov_b32_e32 v31, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -5250,33 +5242,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s16 -; SDAG-NEXT: v_mov_b32_e32 v33, s17 -; SDAG-NEXT: v_mov_b32_e32 v34, s18 -; SDAG-NEXT: v_mov_b32_e32 v35, s19 -; SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s12 -; SDAG-NEXT: v_mov_b32_e32 v33, s13 -; SDAG-NEXT: v_mov_b32_e32 v34, s14 -; SDAG-NEXT: v_mov_b32_e32 v35, s15 -; SDAG-NEXT: global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: 
s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v32, s8 -; SDAG-NEXT: v_mov_b32_e32 v33, s9 -; SDAG-NEXT: v_mov_b32_e32 v34, s10 -; SDAG-NEXT: v_mov_b32_e32 v35, s11 -; SDAG-NEXT: global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5287,9 +5265,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: v_mov_b32_e32 v32, 25 ; GISEL-NEXT: v_mov_b32_e32 v33, 42 ; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0 -; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16 -; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] @@ -5321,20 +5296,20 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 
sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index ef3bb0cb5f4f1..5475fa2ae5c6e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v8, v[4:7], 
s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd: @@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0 ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: s_nop 6 -; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index a84b4803b04cc..9a23788f8855a 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -373,7 +373,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mov_b32_e32 v22, 0x7fc00000 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] ; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 @@ -507,13 +507,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) 
-; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5] +; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5] ; CHECK-NEXT: s_nop 3 -; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v3 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:7] ; CHECK-NEXT: ;;#ASMEND @@ -635,14 +635,46 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: global_store_dwordx4 v32, a[24:27], s[2:3] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, a[28:31], s[2:3] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, a[16:19], s[2:3] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, a[20:23], s[2:3] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, a[8:11], s[2:3] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, a[12:15], s[2:3] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, a[0:3], s[2:3] -; CHECK-NEXT: global_store_dwordx4 v32, a[4:7], s[2:3] offset:16 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a4 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a5 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a6 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a7 +; CHECK-NEXT: v_accvgpr_read_b32 v8, a8 +; CHECK-NEXT: v_accvgpr_read_b32 v9, a9 +; CHECK-NEXT: v_accvgpr_read_b32 v10, a10 +; CHECK-NEXT: v_accvgpr_read_b32 v11, a11 +; CHECK-NEXT: 
v_accvgpr_read_b32 v12, a12 +; CHECK-NEXT: v_accvgpr_read_b32 v13, a13 +; CHECK-NEXT: v_accvgpr_read_b32 v14, a14 +; CHECK-NEXT: v_accvgpr_read_b32 v15, a15 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a16 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a17 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a18 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a19 +; CHECK-NEXT: v_accvgpr_read_b32 v20, a20 +; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 +; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 +; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 +; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 +; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 +; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 +; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 +; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 +; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 +; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 +; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) @@ -724,18 +756,15 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mov_b32_e32 v12, v31 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] -; CHECK-NEXT: v_and_b32_e32 v12, 0x3ff, v12 -; CHECK-NEXT: s_nop 2 +; CHECK-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3] ; CHECK-NEXT: 
v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1] ; CHECK-NEXT: s_nop 8 ; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off -; CHECK-NEXT: v_lshlrev_b32_e32 v4, 3, v12 -; CHECK-NEXT: v_mov_b32_e32 v5, 0 -; CHECK-NEXT: v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5] -; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: global_store_dwordx2 v[4:5], a[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %src2 = call double asm sideeffect "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll index e77856d073a0b..a81d9a458e23a 100644 --- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll @@ -311,44 +311,43 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[12:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: global_store_dwordx4 v6, v[60:63], s[16:17] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, v[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, v[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48 ; 
CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, v[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, v[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, v[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, a[56:59], s[16:17] offset:96 +; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, a[60:63], s[16:17] offset:112 +; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, a[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, a[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, a[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, a[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, a[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v6, a[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) From 
2d8ef8b52072e0d4d2594ca7a7352f15dbf9c3f5 Mon Sep 17 00:00:00 2001 From: mssefat Date: Fri, 19 Sep 2025 23:31:46 -0400 Subject: [PATCH 04/20] Resotred SIRegisterInfo files --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 32 ----------------------- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4 +-- 2 files changed, 1 insertion(+), 35 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 80743157b2724..311557909916a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3830,38 +3830,6 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg, } return false; } - case AMDGPURI::HasRegisterAvoidanceList: { - const SIMachineFunctionInfo *MFI = MF.getInfo(); - ArrayRef AvoidRegs = MFI->getRegistersToAvoid(VirtReg); - - if (AvoidRegs.empty()) - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, - MF, VRM); - // Collect physical registers to avoid - SmallSet AvoidPhysRegs; - for (Register AvoidReg : AvoidRegs) { - if (VRM && VRM->hasPhys(AvoidReg)) { - // Virtual register already mapped - try to avoid its physical register - MCPhysReg AvoidPhys = VRM->getPhys(AvoidReg); - for (MCRegAliasIterator AI(AvoidPhys, this, true); AI.isValid(); ++AI) - AvoidPhysRegs.insert(*AI); - } - } - - if (AvoidPhysRegs.empty()) { - // No physical registers added yet - use default order - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, - MF, VRM); - } - - // Prioritize registers that don't conflict with avoided registers - for (MCPhysReg PhysReg : Order) { - if (!AvoidPhysRegs.count(PhysReg) && !MRI.isReserved(PhysReg)) - Hints.push_back(PhysReg); - } - - return false; - } default: return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index ed0c580abc952..7b91ba7bc581f 100644 --- 
a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -31,11 +31,9 @@ class RegisterBank; struct SGPRSpillBuilder; /// Register allocation hint types. Helps eliminate unneeded COPY with True16 -/// HasRegisterAvoidanceList helps with minimizing usage of conflicting physical -/// registers namespace AMDGPURI { -enum { Size16 = 1, Size32 = 2, HasRegisterAvoidanceList = 3 }; +enum { Size16 = 1, Size32 = 2 }; } // end namespace AMDGPURI From f0f214b0e4155de1342d7d1f93cda06f997924b9 Mon Sep 17 00:00:00 2001 From: mssefat Date: Fri, 19 Sep 2025 23:33:47 -0400 Subject: [PATCH 05/20] Restored SIMachineFunctionInfo files --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index a66e342bef42c..b7dbb5994ee41 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -1214,20 +1214,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; } AMDGPU::ClusterDimsAttr getClusterDims() const { return ClusterDims; } - - // Map of registers to avoid for a given register - DenseMap> RegisterAvoidanceMap; - - void addRegisterToAvoid(Register VirtReg, Register AvoidReg) { - RegisterAvoidanceMap[VirtReg].push_back(AvoidReg); - } - - ArrayRef getRegistersToAvoid(Register VirtReg) const { - auto It = RegisterAvoidanceMap.find(VirtReg); - if (It != RegisterAvoidanceMap.end()) - return It->second; - return ArrayRef(); - } }; } // end namespace llvm From 9b0ca6f8e260f8cd70a87e11d88c0eaf05c99b82 Mon Sep 17 00:00:00 2001 From: mssefat Date: Sun, 21 Sep 2025 00:17:36 -0400 Subject: [PATCH 06/20] Updated sources to support anti-hint mechanism --- .../include/llvm/CodeGen/MIRParser/MIParser.h | 1 + llvm/include/llvm/CodeGen/MIRYamlMapping.h | 3 + 
.../llvm/CodeGen/MachineRegisterInfo.h | 56 +++++++++++++++ llvm/lib/CodeGen/AllocationOrder.cpp | 68 ++++++++++++++++++- llvm/lib/CodeGen/AllocationOrder.h | 7 ++ llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 19 ++++++ llvm/lib/CodeGen/MIRPrinter.cpp | 11 +++ llvm/lib/CodeGen/MachineRegisterInfo.cpp | 27 ++++++++ .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 14 +--- 9 files changed, 192 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index 0f2898d3554d0..1d0a745d5f983 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -45,6 +45,7 @@ struct VRegInfo { } D; Register VReg; Register PreferredReg; + SmallVector AntiHints; // Anti-hints uint8_t Flags = 0; }; diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index e80c13885805b..24fac0235e960 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -192,6 +192,7 @@ struct VirtualRegisterDefinition { StringValue Class; StringValue PreferredRegister; std::vector RegisterFlags; + std::vector AntiHints; // TODO: Serialize the target specific register hints. @@ -209,6 +210,8 @@ template <> struct MappingTraits { StringValue()); // Don't print out when it's empty. YamlIO.mapOptional("flags", Reg.RegisterFlags, std::vector()); + YamlIO.mapOptional("anti-hints", Reg.AntiHints, + std::vector()); // For anti-hints. } static const bool flow = true; diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 27b30bd5929ff..bcee5d6b30439 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -42,6 +42,7 @@ namespace llvm { class PSetIterator; +class VirtRegMap; /// Convenient type to represent either a register class or a register bank. 
using RegClassOrRegBank = @@ -107,6 +108,12 @@ class MachineRegisterInfo { VirtReg2IndexFunctor> RegAllocHints; + /// AntiHintRegs - This vector records register anti-hints for + /// virtual registers. For each virtual register, it keeps a vector of virtual + /// registers that should NOT be allocated to the same or overlapping physical + /// registers. + IndexedMap, VirtReg2IndexFunctor> AntiHintRegs; + /// PhysRegUseDefLists - This is an array of the head of the use/def list for /// physical registers. std::unique_ptr PhysRegUseDefLists; @@ -860,6 +867,55 @@ class MachineRegisterInfo { return RegAllocHints.inBounds(VReg) ? &RegAllocHints[VReg] : nullptr; } + /// setRegAllocationAntiHint - Add a register allocation anti-hint for the + /// specified virtual register. This tells the allocator to avoid allocating + /// VReg to the same physical register as AntiHintVReg (or overlapping ones). + void setRegAllocationAntiHint(Register VReg, Register AntiHintVReg) { + assert(VReg.isVirtual() && "Anti-hints are only for virtual registers"); + assert(AntiHintVReg.isVirtual() && "Anti-hint target must be virtual"); + AntiHintRegs.grow(Register::index2VirtReg(getNumVirtRegs())); + auto &AntiHints = AntiHintRegs[VReg]; + // Avoid duplicates + if (llvm::find(AntiHints, AntiHintVReg) == AntiHints.end()) + AntiHints.push_back(AntiHintVReg); + } + + /// addRegAllocationAntiHint - Add multiple anti-hints at once + void addRegAllocationAntiHints(Register VReg, ArrayRef AntiHintVRegs) { + for (Register AntiHint : AntiHintVRegs) + setRegAllocationAntiHint(VReg, AntiHint); + } + + /// clearRegAllocationAntiHints - Clear all anti-hints for a register + void clearRegAllocationAntiHints(Register VReg) { + assert(VReg.isVirtual()); + if (AntiHintRegs.inBounds(VReg)) + AntiHintRegs[VReg].clear(); + } + + /// getRegAllocationAntiHints - Return the vector of anti-hints for VReg + ArrayRef getRegAllocationAntiHints(Register VReg) const { + assert(VReg.isVirtual()); + if 
(!AntiHintRegs.inBounds(VReg)) + return ArrayRef(); + return AntiHintRegs[VReg]; + } + + /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint + bool hasRegAllocationAntiHint(Register VReg, Register AntiHintVReg) const { + assert(VReg.isVirtual() && AntiHintVReg.isVirtual()); + if (!AntiHintRegs.inBounds(VReg)) + return false; + const auto &AntiHints = AntiHintRegs[VReg]; + return llvm::find(AntiHints, AntiHintVReg) != AntiHints.end(); + } + + /// getPhysRegAntiHints - Get the set of physical registers to avoid based on + /// anti-hints and current allocations. This is called during allocation. + /// VRM is the current virtual register map showing allocations made so far. + void getPhysRegAntiHints(Register VReg, SmallVectorImpl &PhysAntiHints, + const VirtRegMap *VRM) const; + /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the /// specified register as undefined which causes the DBG_VALUE to be /// deleted during LiveDebugVariables analysis. diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp index 183dc8af1b91b..f57df79128c64 100644 --- a/llvm/lib/CodeGen/AllocationOrder.cpp +++ b/llvm/lib/CodeGen/AllocationOrder.cpp @@ -31,6 +31,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM, const LiveRegMatrix *Matrix) { const MachineFunction &MF = VRM.getMachineFunction(); const TargetRegisterInfo *TRI = &VRM.getTargetRegInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); auto Order = RegClassInfo.getOrder(MF.getRegInfo().getRegClass(VirtReg)); SmallVector Hints; bool HardHints = @@ -44,8 +45,69 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM, dbgs() << '\n'; } }); - assert(all_of(Hints, - [&](MCPhysReg Hint) { return is_contained(Order, Hint); }) && + + // Get anti-hints + SmallVector AntiHintedPhysRegs; + MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, &VRM); + + LLVM_DEBUG({ + if (!AntiHintedPhysRegs.empty()) 
{ + dbgs() << "anti-hints:"; + for (MCPhysReg AntiHint : AntiHintedPhysRegs) + dbgs() << ' ' << printReg(AntiHint, TRI); + dbgs() << '\n'; + } + }); + + // Create allocation order object + AllocationOrder AO(std::move(Hints), Order, HardHints); + + // Apply anti-hint filtering if needed + if (!AntiHintedPhysRegs.empty()) { + AO.applyAntiHints(AntiHintedPhysRegs, TRI); + + LLVM_DEBUG({ + if (!AO.Hints.empty()) { + dbgs() << "filtered hints:"; + for (MCPhysReg Hint : AO.Hints) + dbgs() << ' ' << printReg(Hint, TRI); + dbgs() << '\n'; + } + }); + } + + + assert(all_of(AO.Hints, + [&](MCPhysReg Hint) { return is_contained(AO.Order, Hint); }) && "Target hint is outside allocation order."); - return AllocationOrder(std::move(Hints), Order, HardHints); + return AO; +} + +void AllocationOrder::applyAntiHints(ArrayRef AntiHintedPhysRegs, + const TargetRegisterInfo *TRI) { + // Create filtered order + FilteredOrderStorage.clear(); + FilteredOrderStorage.reserve(Order.size()); + + // Add non-anti-hinted registers first + for (MCPhysReg PhysReg : Order) { + if (!is_contained(AntiHintedPhysRegs, PhysReg)) { + FilteredOrderStorage.push_back(PhysReg); + } + } + + // Add anti-hinted registers at the end as last resort + for (MCPhysReg PhysReg : Order) { + if (is_contained(AntiHintedPhysRegs, PhysReg)) { + FilteredOrderStorage.push_back(PhysReg); + } + } + + // Update Order to point to our filtered storage + Order = FilteredOrderStorage; + + LLVM_DEBUG({ + dbgs() << "moved " << AntiHintedPhysRegs.size() + << " anti-hinted registers to end of allocation order\n"; + }); } diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h index 3dd02c3b14d3a..842f83d957a6d 100644 --- a/llvm/lib/CodeGen/AllocationOrder.h +++ b/llvm/lib/CodeGen/AllocationOrder.h @@ -20,6 +20,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Register.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" namespace llvm { @@ -29,6 +30,7 @@ class 
LiveRegMatrix; class LLVM_LIBRARY_VISIBILITY AllocationOrder { const SmallVector Hints; + SmallVector FilteredOrderStorage; ArrayRef Order; // How far into the Order we can iterate. This is 0 if the AllocationOrder is // constructed with HardHints = true, Order.size() otherwise. While @@ -117,6 +119,11 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder { static_cast(std::numeric_limits::max())); return Reg.isPhysical() && is_contained(Hints, Reg.id()); } + + /// Apply antihint to the allocation order. + void applyAntiHints(ArrayRef AntiHintedPhysRegs, + const TargetRegisterInfo *TRI); + }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 0f792b0ef206c..d63f8040de331 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -735,6 +735,20 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS, FlagStringValue.Value + "'"); Info.Flags |= FlagValue; } + + for (const auto &AntiHintValue : VReg.AntiHints) { + if (Info.Kind != VRegInfo::NORMAL) + return error(VReg.Class.SourceRange.Start, + Twine("anti-hints can only be set for normal vregs")); + + Register AntiHintReg; + if (parseRegisterReference(PFS, AntiHintReg, + AntiHintValue.Value, Error)) + return error(Error, AntiHintValue.SourceRange); + + Info.AntiHints.push_back(AntiHintReg); + } + RegInfo.noteNewVirtualRegister(Info.VReg); } @@ -801,6 +815,11 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS, MRI.setRegClass(Reg, Info.D.RC); if (Info.PreferredReg != 0) MRI.setSimpleHint(Reg, Info.PreferredReg); + + for (Register AntiHint : Info.AntiHints) { + if (AntiHint != 0) + MRI.setRegAllocationAntiHint(Reg, AntiHint); + } break; case VRegInfo::GENERIC: break; diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index bf8a6cdf097a9..1a88ff279a3c2 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ 
-316,6 +316,17 @@ static void convertMRI(yaml::MachineFunction &YamlMF, const MachineFunction &MF, if (PreferredReg) printRegMIR(PreferredReg, VReg.PreferredRegister, TRI); printRegFlags(Reg, VReg.RegisterFlags, MF, TRI); + // Print the anti-hints. + const auto &AntiHints = RegInfo.getRegAllocationAntiHints(Reg); + if (!AntiHints.empty()) { + std::vector AntiHintStrings; + for (Register AntiHint : AntiHints) { + yaml::FlowStringValue AntiHintStr; + printRegMIR(AntiHint, AntiHintStr, TRI); + AntiHintStrings.push_back(std::move(AntiHintStr)); + } + VReg.AntiHints = std::move(AntiHintStrings); + } YamlMF.VirtualRegisters.push_back(std::move(VReg)); } diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index ae284f3ae2929..bbf03830b3bd5 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DebugLoc.h" @@ -674,3 +675,29 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const { } return false; } + +void MachineRegisterInfo::getPhysRegAntiHints(Register VReg, + SmallVectorImpl &PhysAntiHints, + const VirtRegMap *VRM) const { + assert(VReg.isVirtual()); + if (!AntiHintRegs.inBounds(VReg) || !VRM) + return; + + const auto &AntiHints = AntiHintRegs[VReg]; + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + + for (Register AntiHintVReg : AntiHints) { + // Check if the anti-hinted register has been allocated + if (VRM->hasPhys(AntiHintVReg)) { + MCPhysReg PhysReg = VRM->getPhys(AntiHintVReg); + // Add the physical register and all its aliases + for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) { + PhysAntiHints.push_back(*AI); + } + } + } + + // Remove duplicates + 
llvm::sort(PhysAntiHints); + PhysAntiHints.erase(llvm::unique(PhysAntiHints), PhysAntiHints.end()); +} \ No newline at end of file diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index ed349fccfa3e4..1a8cd84f7640a 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -323,17 +323,9 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { // Check if MFMA register is dead at current instruction const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg); if (!MFMAInterval.liveAt(CurrentSlot)) { - - // Add bidirectional avoidance hint - MFI->addRegisterToAvoid(CandidateReg, MFMAReg); - MFI->addRegisterToAvoid(MFMAReg, CandidateReg); - - // Set hint if we found registers to avoid - MRI->setRegAllocationHint( - MFMAReg, AMDGPURI::HasRegisterAvoidanceList, Register()); - MRI->setRegAllocationHint(CandidateReg, - AMDGPURI::HasRegisterAvoidanceList, - Register()); + // Add bidirectional antihints + MRI->addRegAllocationAntiHints(CandidateReg, MFMARegs); + MRI->addRegAllocationAntiHints(MFMAReg, CandidateReg); } } } From e34e16bfdcb85260860c0965babdbc8b60efaa1b Mon Sep 17 00:00:00 2001 From: mssefat Date: Sun, 21 Sep 2025 01:58:40 -0400 Subject: [PATCH 07/20] Made anti-hints map conditional in MIRYamlMapping --- llvm/include/llvm/CodeGen/MIRYamlMapping.h | 6 ++++-- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 24fac0235e960..20cc3c370dc66 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -210,8 +210,10 @@ template <> struct MappingTraits { StringValue()); // Don't print out when it's empty. 
YamlIO.mapOptional("flags", Reg.RegisterFlags, std::vector()); - YamlIO.mapOptional("anti-hints", Reg.AntiHints, - std::vector()); // For anti-hints. + if(!YamlIO.outputting() || !Reg.AntiHints.empty()) { // Only map when parsing or anti-hints present + YamlIO.mapOptional("anti-hints", Reg.AntiHints, + std::vector()); // for anti-hints + } } static const bool flow = true; diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 1a8cd84f7640a..f63eea716d68b 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -48,7 +48,7 @@ static cl::opt EnableRegisterAvoidListForMFMARegs( "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden, cl::desc("Enable Register Avoidance for " "MFMA in GCNPreRAOptimizations stage."), - cl::init(false)); + cl::init(true)); namespace { From 887b556f19b144187e3c30579c8bd838a31e21af Mon Sep 17 00:00:00 2001 From: mssefat Date: Sun, 21 Sep 2025 02:14:53 -0400 Subject: [PATCH 08/20] Updated tests --- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 538 ++++----- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 542 ++++----- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 116 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 120 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 278 +++-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 150 +-- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 271 +++-- .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll | 12 +- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 1071 +++++++++++------ .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 70 +- .../unspill-vgpr-after-rewrite-vgpr-mfma.ll | 170 ++- 11 files changed, 1875 insertions(+), 1463 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index b07dec326327e..3d9be93573ac9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -15,9 +15,12 @@ ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ; implicit-def: $vgpr106 ; GCN-NEXT: ; implicit-def: $vgpr132 + ; GCN-NEXT: ; implicit-def: $vgpr112 + ; GCN-NEXT: ; implicit-def: $vgpr113 + ; GCN-NEXT: ; implicit-def: $vgpr114 + ; GCN-NEXT: ; implicit-def: $vgpr115 ; GCN-NEXT: ; implicit-def: $vgpr133 ; GCN-NEXT: ; implicit-def: $vgpr139 - ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -167,46 +170,45 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 - ; GCN-NEXT: ; implicit-def: $vgpr64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: ; implicit-def: $vgpr73 - ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 + ; GCN-NEXT: v_add_u32_e32 v73, v132, v112 ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v113 + ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: 
s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr74 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 - ; GCN-NEXT: ; implicit-def: $vgpr75 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v114 ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v115 ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ; kill: killed $vgpr73 ; GCN-NEXT: ds_read_b128 v[72:75], v94 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr76 ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ; implicit-def: $sgpr8 + ; GCN-NEXT: ; implicit-def: $vgpr112 + ; GCN-NEXT: ; implicit-def: $vgpr113 + ; GCN-NEXT: ; implicit-def: $vgpr114 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 @@ -411,8 +413,6 @@ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 ; GCN-NEXT: ; implicit-def: $vgpr65 ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr68 - ; GCN-NEXT: ; implicit-def: $vgpr67 ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 @@ -440,40 +440,36 @@ ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 + ; GCN-NEXT: ; implicit-def: $vgpr96 ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] ; GCN-NEXT: v_max_f32_e32 
v64, v64, v64 ; GCN-NEXT: ; implicit-def: $vgpr65 ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v96 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 - ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v112 + ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v113 ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v114 ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_exp_f32_e32 v163, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v164, v57 + ; GCN-NEXT: v_exp_f32_e32 v165, v57 ; GCN-NEXT: v_exp_f32_e32 v49, v48 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 @@ -499,31 +495,30 @@ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 ; 
GCN-NEXT: v_exp_f32_e32 v55, v48 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 ; GCN-NEXT: v_exp_f32_e32 v56, v48 ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 + ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 + ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_fma_f32 v156, s4, v59, -v134 + ; GCN-NEXT: v_pack_b32_f16 v59, v68, v58 + ; GCN-NEXT: v_pack_b32_f16 v58, v64, v67 + ; GCN-NEXT: v_mul_f32_e32 v80, 0x3fb8aa3b, v66 ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] @@ -532,288 +527,287 @@ ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] + ; 
GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 + ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GCN-NEXT: v_fma_f32 v157, s4, v60, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[58:59], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v141, v80 ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 + ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 - ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 - ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48 - ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 - ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 - ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 + ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 - ; GCN-NEXT: v_exp_f32_e32 v58, v58 - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48 - ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48 - ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48 - ; 
GCN-NEXT: v_mul_f32_e64 v101, v101, v48 - ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 - ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 + ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 - ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 - ; GCN-NEXT: v_exp_f32_e32 v59, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] - ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48 - ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 - ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 - ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95] + ; GCN-NEXT: v_cvt_f16_f32_e32 v144, v54 + ; GCN-NEXT: v_cvt_f16_f32_e32 v145, v55 + ; GCN-NEXT: v_exp_f32_e32 v167, v57 + ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; GCN-NEXT: v_mul_f32_e32 v168, 0x3fb8aa3b, v157 + ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], 
v[148:149], v[58:59], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v148, v56 + ; GCN-NEXT: v_mul_f32_e64 v118, v118, v48 + ; GCN-NEXT: v_mul_f32_e64 v119, v119, v48 + ; GCN-NEXT: v_mul_f32_e64 v120, v120, v48 + ; GCN-NEXT: v_mul_f32_e64 v121, v121, v48 ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v149, v145, v148 + ; GCN-NEXT: v_pack_b32_f16 v148, v140, v144 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v156 + ; GCN-NEXT: v_exp_f32_e32 v168, v168 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v153, v140 + ; GCN-NEXT: ; implicit-def: $vgpr140 + ; GCN-NEXT: v_fma_f32 v164, s4, v61, -v134 + ; GCN-NEXT: v_fma_f32 v166, s4, v62, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v169, v141 ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 - ; GCN-NEXT: ; implicit-def: $vgpr57 - ; GCN-NEXT: ds_read_b128 v[60:63], v57 + ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 + ; GCN-NEXT: v_fma_f32 v57, s4, v35, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[148:149], v[64:79] + ; GCN-NEXT: ds_read_b128 v[142:145], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v160, v149 - ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 + ; GCN-NEXT: ds_read_b128 v[156:159], v140 offset:576 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 - ; GCN-NEXT: v_exp_f32_e32 v162, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[148:149], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v164 + ; GCN-NEXT: v_fma_f32 v164, s4, v33, -v134 + ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v166 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v165 + ; GCN-NEXT: v_exp_f32_e32 v170, v146 + ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v167 ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[148:149], v[96:111] ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v153 + ; GCN-NEXT: v_pack_b32_f16 v62, v169, v147 ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 + ; GCN-NEXT: v_perm_b32 v147, v131, v129, s8 + ; GCN-NEXT: v_pack_b32_f16 v63, v146, v33 ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 - ; GCN-NEXT: v_exp_f32_e32 v153, v33 - ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], 
v[148:149], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 - ; GCN-NEXT: v_exp_f32_e32 v165, v60 - ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 - ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v161, v61 - ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[148:149], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v148, v33 + ; GCN-NEXT: v_fma_f32 v152, s4, v36, -v134 + ; GCN-NEXT: v_perm_b32 v36, v162, v160, s5 + ; GCN-NEXT: v_cvt_f16_f32_e32 v149, v168 + ; GCN-NEXT: v_cvt_f16_f32_e32 v155, v170 + ; GCN-NEXT: v_perm_b32 v146, v163, v161, s8 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[62:63], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_read_b128 v[32:35], v140 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[58:61], v140 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v164 + ; GCN-NEXT: v_exp_f32_e32 v154, v142 + ; GCN-NEXT: v_perm_b32 v142, v162, v160, s8 + ; GCN-NEXT: v_fma_f32 v160, s4, v38, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[156:157], v[62:63], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v157, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v148 + ; GCN-NEXT: v_fma_f32 v156, s4, v37, -v134 ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 - ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: v_perm_b32 v143, v130, v128, s8 ; GCN-NEXT: 
;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] - ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[62:63], v[96:111] ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 - ; GCN-NEXT: v_exp_f32_e32 v159, v33 + ; GCN-NEXT: v_perm_b32 v32, v163, v161, s5 + ; GCN-NEXT: v_exp_f32_e32 v161, v33 ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v152, v38 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[60:61] + ; GCN-NEXT: ds_write_b64 v136, v[142:143] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v137, v[32:33] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[58:59], v[62:63], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v59, v150, v38 + ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_pack_b32_f16 v58, v149, v155 + ; GCN-NEXT: v_exp_f32_e32 v149, v38 ; GCN-NEXT: ; implicit-def: $vgpr33 ; GCN-NEXT: ; implicit-def: $vgpr38 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[140:141] + ; GCN-NEXT: ds_write_b64 v138, v[146:147] ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 
v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; implicit-def: $vgpr36 ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[144:145], v[58:59], v[64:79] ; GCN-NEXT: ; implicit-def: $vgpr37 ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v33, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v152 + ; GCN-NEXT: v_exp_f32_e32 v150, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 - ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 - ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[158:159], v[58:59], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v156, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v160 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v154 + ; GCN-NEXT: v_cvt_f16_f32_e32 v152, v157 + ; GCN-NEXT: v_fma_f32 v57, s4, v39, -v134 ; GCN-NEXT: ds_read_b128 v[36:39], v139 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v154, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 - ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v155, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 - ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 - ; GCN-NEXT: v_exp_f32_e32 v157, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 - ; GCN-NEXT: v_exp_f32_e32 v146, v32 + ; GCN-NEXT: ds_read_b128 v[128:131], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[58:59], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v161 + ; GCN-NEXT: v_exp_f32_e32 v159, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v149 + ; GCN-NEXT: v_fma_f32 v155, s4, v41, -v134 + ; GCN-NEXT: v_fma_f32 v158, s4, v42, -v134 + ; GCN-NEXT: v_fma_f32 v162, s4, v20, -v134 + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[60:61], v[58:59], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v59, v34, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_pack_b32_f16 v58, v33, v152 + ; GCN-NEXT: v_exp_f32_e32 v60, v32 ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 - ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] + ; GCN-NEXT: v_fma_f32 v57, s4, v43, -v134 + ; GCN-NEXT: v_perm_b32 v20, v142, v62, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[58:59], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 ; GCN-NEXT: ds_read_b128 v[40:43], v139 
offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v147, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v143, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 + ; GCN-NEXT: v_exp_f32_e32 v61, v36 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v150 + ; GCN-NEXT: v_fma_f32 v155, s4, v46, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[128:129], v[58:59], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v152, v36 + ; GCN-NEXT: v_cvt_f16_f32_e32 v128, v156 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_fma_f32 v129, s4, v45, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[58:59], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v159 + ; GCN-NEXT: v_exp_f32_e32 v158, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v129, v36 + ; GCN-NEXT: v_pack_b32_f16 v32, v37, v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[58:59], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v57, v36 ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 - ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v61 + ; GCN-NEXT: v_fma_f32 v58, s4, v47, -v134 ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v57 + ; GCN-NEXT: ds_read_b128 v[36:39], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v142, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 - ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 + ; GCN-NEXT: v_exp_f32_e32 v128, v40 + ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v129, v152 + ; GCN-NEXT: ds_read_b128 v[44:47], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v63, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 - ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 - ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[130:131], v[32:33], v[80:95] + ; GCN-NEXT: v_fma_f32 v130, s4, v17, -v134 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_exp_f32_e32 v131, v40 + ; GCN-NEXT: v_pack_b32_f16 v40, v59, v129 + ; GCN-NEXT: v_fma_f32 v155, s4, v18, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v128 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 - ; GCN-NEXT: v_exp_f32_e32 v158, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v158 + ; GCN-NEXT: v_exp_f32_e32 v160, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v57 ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v58 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v128, v17 - ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 - ; GCN-NEXT: 
v_perm_b32 v43, v149, v145, s8 + ; GCN-NEXT: v_fma_f32 v58, s4, v19, -v134 + ; GCN-NEXT: v_exp_f32_e32 v129, v17 + ; GCN-NEXT: v_perm_b32 v42, v143, v63, s8 + ; GCN-NEXT: v_perm_b32 v43, v147, v145, s8 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 + ; GCN-NEXT: ds_read_b128 v[16:19], v140 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 + ; GCN-NEXT: ds_read_b128 v[32:35], v140 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 - ; GCN-NEXT: v_exp_f32_e32 v167, v36 - ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 + ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v130 + ; GCN-NEXT: v_exp_f32_e32 v163, v36 + ; GCN-NEXT: v_perm_b32 v36, v142, v62, s8 ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 - ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v160 + ; GCN-NEXT: v_perm_b32 v21, v146, v144, s5 + ; GCN-NEXT: v_perm_b32 v37, v146, v144, s8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v131 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b64 v135, v[20:21] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 - ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v140, v17 - ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: v_perm_b32 v16, v143, v63, s5 + ; GCN-NEXT: v_fma_f32 v63, s4, v22, 
-v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v129 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_exp_f32_e32 v142, v17 + ; GCN-NEXT: v_perm_b32 v17, v147, v145, s5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v136, v[36:37] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v58 ; GCN-NEXT: v_exp_f32_e32 v144, v22 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -836,22 +830,22 @@ ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 + ; GCN-NEXT: v_pack_b32_f16 v32, v59, v44 ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v162 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] ; GCN-NEXT: v_exp_f32_e32 v132, v16 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 - ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v163 + ; GCN-NEXT: v_fma_f32 v143, s4, v23, -v134 ; GCN-NEXT: ds_read_b128 v[20:23], v139 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -860,20 +854,20 @@ ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 + ; 
GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v63 ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130 ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 + ; GCN-NEXT: v_fma_f32 v63, s4, v26, -v134 + ; GCN-NEXT: v_fma_f32 v147, s4, v4, -v134 ; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v142 ; GCN-NEXT: v_exp_f32_e32 v145, v16 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v143 ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 ; GCN-NEXT: v_exp_f32_e32 v35, v16 ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 @@ -895,11 +889,11 @@ ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v63 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v141, v16 + ; GCN-NEXT: v_exp_f32_e32 v143, v16 ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 + ; GCN-NEXT: v_fma_f32 v63, s4, v30, -v134 ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] @@ -907,25 +901,25 @@ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v57 + ; GCN-NEXT: ds_read_b128 v[20:23], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: 
v_exp_f32_e32 v36, v24 ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 + ; GCN-NEXT: ds_read_b128 v[28:31], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v63 ; GCN-NEXT: v_exp_f32_e32 v39, v24 ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 + ; GCN-NEXT: v_fma_f32 v63, s4, v2, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 - ; GCN-NEXT: v_exp_f32_e32 v148, v1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v143 + ; GCN-NEXT: v_exp_f32_e32 v146, v1 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 @@ -933,25 +927,25 @@ ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 ; GCN-NEXT: v_exp_f32_e32 v34, v1 ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 + ; GCN-NEXT: v_perm_b32 v27, v59, v45, s8 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 + ; GCN-NEXT: ds_read_b128 v[16:19], v140 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v150, v20 + ; GCN-NEXT: v_exp_f32_e32 v155, v20 ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 + ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v146 ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] ; GCN-NEXT: v_exp_f32_e32 v38, v21 ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 + ; GCN-NEXT: v_perm_b32 v5, v58, v44, s5 + ; GCN-NEXT: v_perm_b32 v21, v58, v44, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND @@ -961,9 +955,9 @@ ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v63 ; GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: v_perm_b32 v1, v59, v45, s5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b64 v136, v[20:21] @@ -987,10 +981,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v147 ; GCN-NEXT: v_exp_f32_e32 v26, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v155 ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1042,10 +1036,10 @@ ; GCN-NEXT: v_exp_f32_e32 v21, v9 ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v57 + ; GCN-NEXT: ds_read_b128 v[4:7], v140 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 + ; GCN-NEXT: ds_read_b128 v[12:15], v140 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 @@ -1071,33 +1065,33 @@ ; GCN-NEXT: v_add_f32_e32 v3, 
v54, v3 ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v168, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v170, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v149, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v60, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v61, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v57, v3 ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v131, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 ; 
GCN-NEXT: v_add_f32_e32 v3, v144, v3 ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 @@ -1105,14 +1099,14 @@ ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 @@ -1137,7 +1131,7 @@ ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 @@ -1147,7 +1141,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN-NEXT: ds_read_b128 v[0:3], v140 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0887fdf0844b0..be97a1e82fcf2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -10,25 +10,24 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s20, v2 ; GCN-NEXT: ; implicit-def: $sgpr4 - ; GCN-NEXT: ; implicit-def: 
$vgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr64 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr50 + ; GCN-NEXT: ; implicit-def: $vgpr76 ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: ; implicit-def: $vgpr49 ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 - ; GCN-NEXT: ; implicit-def: $vgpr51 - ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr50 ; GCN-NEXT: ; implicit-def: $vgpr77 ; GCN-NEXT: ; implicit-def: $vgpr78 ; GCN-NEXT: ; implicit-def: $vgpr79 ; GCN-NEXT: ; implicit-def: $vgpr80 - ; GCN-NEXT: ; implicit-def: $vgpr91 + ; GCN-NEXT: ; implicit-def: $vgpr81 + ; GCN-NEXT: ; implicit-def: $vgpr103 ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 + ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v64 ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -36,8 +35,9 @@ ; GCN-NEXT: s_lshl_b32 s4, s20, 7 ; GCN-NEXT: ; implicit-def: $vgpr5 ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1 - ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 - ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76 + ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 + ; GCN-NEXT: v_add_u32_e32 v77, s20, v77 + ; GCN-NEXT: v_and_b32_e32 v77, 0x1fffffff, v77 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: ds_write_b128 v48, v[0:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -48,8 +48,8 @@ ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_add_u32_e32 v0, v0, v50 - ; GCN-NEXT: v_add_u32_e32 v1, v1, v50 + ; GCN-NEXT: v_add_u32_e32 v0, v0, v76 + ; GCN-NEXT: v_add_u32_e32 
v1, v1, v76 ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 @@ -68,22 +68,22 @@ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 - ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6 - ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 1 - ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76 - ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: v_mul_lo_u32 v77, v77, s6 + ; GCN-NEXT: v_add_lshl_u32 v77, v78, v77, 1 ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 ; GCN-NEXT: ; implicit-def: $sgpr2 ; GCN-NEXT: ; implicit-def: $sgpr3 - ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 + ; GCN-NEXT: v_lshl_add_u32 v80, v81, 1, v79 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ds_read_b128 v[36:39], v51 + ; GCN-NEXT: ds_read_b128 v[36:39], v50 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512 + ; GCN-NEXT: ds_read_b128 v[44:47], v50 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 @@ -107,20 +107,20 @@ ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v51 + ; GCN-NEXT: ds_read_b128 v[68:71], v50 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] ; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: v_add_u32_e32 v82, v32, v50 - ; GCN-NEXT: v_add_u32_e32 v83, v33, v50 - ; GCN-NEXT: ; 
kill: killed $vgpr82 + ; GCN-NEXT: v_add_u32_e32 v83, v32, v76 + ; GCN-NEXT: v_add_u32_e32 v76, v33, v76 ; GCN-NEXT: ; kill: killed $vgpr83 + ; GCN-NEXT: ; kill: killed $vgpr76 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31] - ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512 + ; GCN-NEXT: ds_read_b128 v[66:69], v50 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART @@ -131,20 +131,20 @@ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15] ; GCN-NEXT: ; implicit-def: $vgpr66 ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_max_f32_e32 v81, v67, v67 + ; GCN-NEXT: v_max_f32_e32 v82, v67, v67 ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31] ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2 ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v70 + ; GCN-NEXT: ds_write_b32 v77, v70 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v71 + ; GCN-NEXT: ds_write_b32 v78, v71 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v72 + ; GCN-NEXT: ds_write_b32 v79, v72 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16 @@ -152,11 +152,11 @@ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v21 ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 ; GCN-NEXT: v_mul_f32_e32 
v84, s4, v22 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v81 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25 ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 @@ -166,12 +166,12 @@ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29 ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v30 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31 ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84 + ; GCN-NEXT: v_max3_f32 v64, v64, v81, v84 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2 ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3 ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86 @@ -179,315 +179,315 @@ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5 ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65 ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v7 ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8 ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v81 ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10 ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11 ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12 ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13 ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 + ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v68 - ; GCN-NEXT: ; implicit-def: $vgpr84 - ; 
GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v70, v64, v65 + ; GCN-NEXT: ds_write_b32 v80, v68 + ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70 + ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; implicit-def: $vgpr87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] - ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 - ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 - ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 - ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 + ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 + ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 + ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[0:1] + ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 + ; GCN-NEXT: v_max_f32_e32 v65, v82, v64 + ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v65 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v65 + ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v65 + ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v65 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 - ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 - ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 - ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 - ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 - 
; GCN-NEXT: v_exp_f32_e32 v73, v16 - ; GCN-NEXT: v_exp_f32_e32 v74, v18 - ; GCN-NEXT: v_exp_f32_e32 v75, v19 + ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v65 + ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v65 + ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v65 + ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v65 + ; GCN-NEXT: v_exp_f32_e32 v72, v16 + ; GCN-NEXT: v_exp_f32_e32 v73, v17 + ; GCN-NEXT: v_exp_f32_e32 v81, v18 + ; GCN-NEXT: v_exp_f32_e32 v82, v19 ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_exp_f32_e32 v80, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 - ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 - ; GCN-NEXT: v_exp_f32_e32 v81, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 - ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 - ; GCN-NEXT: v_exp_f32_e32 v82, v22 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 - ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 - ; GCN-NEXT: v_exp_f32_e32 v83, v23 - ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v72 + ; GCN-NEXT: v_fma_f32 v17, s4, v24, -v65 + ; GCN-NEXT: v_exp_f32_e32 v83, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v73 + ; GCN-NEXT: v_fma_f32 v19, s4, v25, -v65 + ; GCN-NEXT: v_exp_f32_e32 v84, v21 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v81 + ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v65 ; GCN-NEXT: v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_exp_f32_e32 v17, v17 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 - ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 - ; GCN-NEXT: v_exp_f32_e32 v88, v23 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 - ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 + ; 
GCN-NEXT: v_cvt_f16_f32_e32 v21, v82 + ; GCN-NEXT: v_pack_b32_f16 v24, v16, v18 + ; GCN-NEXT: v_sub_f32_e32 v22, v67, v65 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 + ; GCN-NEXT: v_pack_b32_f16 v25, v20, v21 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v19 + ; GCN-NEXT: ds_read_b128 v[16:19], v87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v16, v24 - ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 + ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v65 + ; GCN-NEXT: v_exp_f32_e32 v86, v23 + ; GCN-NEXT: v_exp_f32_e32 v64, v22 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v16, 0, v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v83 + ; GCN-NEXT: v_fma_f32 v88, s4, v28, -v65 + ; GCN-NEXT: v_exp_f32_e32 v89, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v90, v84 + ; GCN-NEXT: v_fma_f32 v91, s4, v29, -v65 + ; GCN-NEXT: v_exp_f32_e32 v92, v21 + ; GCN-NEXT: ds_read_b128 v[20:23], v87 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] - ; 
GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 - ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 - ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 - ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 - ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 - ; GCN-NEXT: v_exp_f32_e32 v30, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 - ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 - ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 - ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 - ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 - ; GCN-NEXT: ds_read_b128 v[26:29], v91 + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[64:65] 
op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0] + ; GCN-NEXT: v_perm_b32 v99, v70, v68, s2 + ; GCN-NEXT: v_perm_b32 v100, v70, v68, s3 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v93, v73, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v94, v85 + ; GCN-NEXT: v_fma_f32 v95, s4, v30, -v65 + ; GCN-NEXT: v_exp_f32_e32 v96, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v97, v86 + ; GCN-NEXT: v_fma_f32 v98, s4, v31, -v65 + ; GCN-NEXT: v_perm_b32 v101, v71, v69, s2 + ; GCN-NEXT: v_perm_b32 v102, v71, v69, s3 + ; GCN-NEXT: ds_read_b128 v[68:71], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576 + ; GCN-NEXT: ds_read_b128 v[72:75], v103 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v31 - ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_exp_f32_e32 v31, v31 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 - ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 + ; GCN-NEXT: ds_write_b32 v77, v99 + ; GCN-NEXT: v_exp_f32_e32 v67, v67 + ; GCN-NEXT: v_pack_b32_f16 v76, v76, v90 + ; GCN-NEXT: v_pack_b32_f16 v77, v94, v97 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v64 + ; GCN-NEXT: ds_write_b32 v78, v100 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: ds_write_b32 v79, v101 
+ ; GCN-NEXT: v_mul_f32_e32 v78, 0x3fb8aa3b, v88 + ; GCN-NEXT: v_mul_f32_e32 v79, 0x3fb8aa3b, v91 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v81, v81, v93 + ; GCN-NEXT: v_cvt_f16_f32_e32 v90, v89 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v65 + ; GCN-NEXT: v_exp_f32_e32 v91, v78 + ; GCN-NEXT: v_cvt_f16_f32_e32 v78, v92 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v65 + ; GCN-NEXT: v_exp_f32_e32 v93, v79 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v65 - ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 - ; GCN-NEXT: v_exp_f32_e32 v22, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 - ; GCN-NEXT: v_exp_f32_e32 v64, v65 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 - ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 - ; GCN-NEXT: v_exp_f32_e32 v23, v23 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 + ; GCN-NEXT: ds_write_b32 v80, v102 + ; GCN-NEXT: v_mul_f32_e32 v80, 0x3fb8aa3b, v95 + ; GCN-NEXT: v_add_f32_e32 v76, v82, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v96 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v65 + ; GCN-NEXT: v_exp_f32_e32 v80, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v79, v67 + ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v98 + ; GCN-NEXT: v_fma_f32 v81, s4, v3, -v65 + ; GCN-NEXT: v_exp_f32_e32 v82, v88 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 - ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19 - ; GCN-NEXT: v_fma_f32 v6, s4, 
v6, -v72 - ; GCN-NEXT: v_exp_f32_e32 v25, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 - ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 - ; GCN-NEXT: v_exp_f32_e32 v27, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 - ; GCN-NEXT: v_exp_f32_e32 v65, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_pack_b32_f16 v0, v90, v78 + ; GCN-NEXT: v_pack_b32_f16 v1, v77, v79 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 - ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 - ; GCN-NEXT: v_exp_f32_e32 v68, v2 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v68, v83, v76 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v91 + ; GCN-NEXT: v_fma_f32 v83, s4, v4, -v65 + ; GCN-NEXT: v_exp_f32_e32 v90, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v93 + ; GCN-NEXT: v_fma_f32 v94, s4, v5, -v65 + ; GCN-NEXT: v_exp_f32_e32 v88, v88 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v68, v84, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v80 + ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v65 + ; GCN-NEXT: v_exp_f32_e32 v72, v2 + ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v82 + ; GCN-NEXT: v_pack_b32_f16 v4, v69, v4 + ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v81 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v84 + ; GCN-NEXT: ds_read_b128 v[0:3], v87 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 - ; GCN-NEXT: v_pack_b32_f16 
v5, v5, v19 - ; GCN-NEXT: v_exp_f32_e32 v24, v24 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v73 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v65 + ; GCN-NEXT: v_exp_f32_e32 v73, v69 + ; GCN-NEXT: ds_read_b128 v[76:79], v87 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 - ; GCN-NEXT: v_exp_f32_e32 v26, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 - ; GCN-NEXT: v_exp_f32_e32 v67, v67 + ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v83 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v68, v85, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v90 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v65 + ; GCN-NEXT: v_exp_f32_e32 v71, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v88 + ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v65 + ; GCN-NEXT: v_exp_f32_e32 v81, v81 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47] ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 - ; GCN-NEXT: v_exp_f32_e32 v6, v6 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 + ; GCN-NEXT: v_add_f32_e32 v68, v86, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v72 + ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v65 + ; GCN-NEXT: v_exp_f32_e32 v74, v6 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v73 ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 - ; GCN-NEXT: v_exp_f32_e32 v7, v7 - ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_fma_f32 
v75, s4, v11, -v65 + ; GCN-NEXT: v_exp_f32_e32 v83, v7 + ; GCN-NEXT: v_pack_b32_f16 v4, v70, v69 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v6 + ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v9 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v89, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v71 + ; GCN-NEXT: v_fma_f32 v70, s4, v12, -v65 + ; GCN-NEXT: v_exp_f32_e32 v84, v7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v81 + ; GCN-NEXT: v_fma_f32 v86, s4, v13, -v65 + ; GCN-NEXT: v_exp_f32_e32 v87, v8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v76, v92, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 - ; GCN-NEXT: v_exp_f32_e32 v10, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 - ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 - ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 - ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 - ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 - ; GCN-NEXT: v_fma_f32 v13, s4, 
v13, -v72 - ; GCN-NEXT: v_exp_f32_e32 v8, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 - ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 - ; GCN-NEXT: v_exp_f32_e32 v9, v9 - ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 - ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 - ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 - ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 - ; GCN-NEXT: v_exp_f32_e32 v11, v11 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 - ; GCN-NEXT: v_exp_f32_e32 v12, v3 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 - ; GCN-NEXT: v_exp_f32_e32 v17, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 - ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 - ; GCN-NEXT: v_exp_f32_e32 v15, v3 - ; GCN-NEXT: v_exp_f32_e32 v18, v1 - ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v91 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v74 + ; GCN-NEXT: v_fma_f32 v77, s4, v14, -v65 + ; GCN-NEXT: v_exp_f32_e32 v89, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v92, v83 + ; GCN-NEXT: v_pack_b32_f16 v68, v68, v85 + ; GCN-NEXT: v_mul_f32_e32 v75, 0x3fb8aa3b, v75 + ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v70 + ; GCN-NEXT: v_pack_b32_f16 v69, v69, v92 + ; GCN-NEXT: v_fma_f32 v65, s4, v15, -v65 + ; GCN-NEXT: v_exp_f32_e32 v75, v75 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v76, v96, v76 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v84 + ; GCN-NEXT: v_exp_f32_e32 v92, v70 + ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v86 + ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v87 + ; GCN-NEXT: v_exp_f32_e32 v94, v70 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v67, v67, v76 + ; GCN-NEXT: v_add_f32_e32 v67, v91, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v93, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v80, v67 + ; GCN-NEXT: 
v_add_f32_e32 v67, v82, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v90, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v88, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v72, v67 + ; GCN-NEXT: v_mul_f32_e32 v68, 0x3fb8aa3b, v77 + ; GCN-NEXT: v_add_f32_e32 v67, v73, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v89 + ; GCN-NEXT: v_exp_f32_e32 v78, v68 + ; GCN-NEXT: v_add_f32_e32 v67, v71, v67 + ; GCN-NEXT: ds_read_b128 v[68:71], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 - ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 - ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 - ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 - ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 - ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12 - ; GCN-NEXT: v_add_f32_e32 v6, v11, v6 - ; GCN-NEXT: v_add_f32_e32 v6, v12, v6 - ; GCN-NEXT: v_add_f32_e32 v1, v15, v6 - ; GCN-NEXT: v_add_f32_e32 v11, v17, v1 - ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v0, v4, v10 - ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576 + ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v75 + ; GCN-NEXT: v_exp_f32_e32 v65, v65 + ; GCN-NEXT: v_add_f32_e32 v67, v81, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v74, v67 + ; GCN-NEXT: v_pack_b32_f16 v77, v76, v77 + ; GCN-NEXT: v_pack_b32_f16 v76, v85, v86 + ; GCN-NEXT: v_add_f32_e32 v67, v83, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v72, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v94 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v78 + ; GCN-NEXT: v_cvt_f16_f32_e32 v74, v92 + ; GCN-NEXT: v_add_f32_e32 v67, v84, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v87, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v89, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v75, v67 + ; GCN-NEXT: v_pack_b32_f16 v69, v68, v72 + ; GCN-NEXT: 
v_pack_b32_f16 v68, v74, v73 + ; GCN-NEXT: ds_read_b128 v[72:75], v103 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v67, v92, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v94, v67 + ; GCN-NEXT: v_add_f32_e32 v67, v78, v67 + ; GCN-NEXT: v_add_f32_e32 v65, v65, v67 + ; GCN-NEXT: ds_bpermute_b32 v67, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v65, v65, v67 + ; GCN-NEXT: ds_bpermute_b32 v66, v66, v65 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mov_b32_e32 v4, 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: v_mov_b32_e32 v67, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] - ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 + ; GCN-NEXT: v_cndmask_b32_e64 v65, v66, v65, s[0:1] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63] + ; GCN-NEXT: v_fmac_f32_e32 v65, v67, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index ed3d1399e5926..17692a38dfc64 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: ; 
GFX90A-VGPR: ; %bb.0: ; %bb ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 4 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] 
cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -647,37 +647,37 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX90A-VGPR: ; %bb.0: ; %bb ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 10 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 2 ; GFX942-VGPR-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 64 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: 
v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits: @@ -1326,26 +1326,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 64 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[2:3] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-VGPR-NEXT: 
v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0) @@ -1645,8 +1645,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm: @@ -1673,8 +1673,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; 
GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) @@ -1759,8 +1759,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit: @@ -1787,8 +1787,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index 2fb677eccc4b3..07a4f33f25b17 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -2460,6 +2460,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -2480,12 +2481,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_nop 10 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: @@ -2525,6 +2525,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -2545,12 +2546,11 @@ define 
amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_nop 11 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: @@ -3607,6 +3607,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -3627,12 +3628,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, 
v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_nop 10 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: @@ -3672,6 +3672,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -3692,12 +3693,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_nop 11 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: @@ -3910,6 +3910,7 
@@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -3930,12 +3931,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_nop 10 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: @@ -3975,6 +3975,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -3995,12 +3996,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) 
%ar ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_nop 11 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: @@ -4213,6 +4213,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -4233,12 +4234,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: 
global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_nop 10 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: @@ -4278,6 +4278,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -4298,12 +4299,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_nop 11 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: @@ -4516,6 +4516,7 @@ define amdgpu_kernel void 
@test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -4536,12 +4537,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-SDAG-NEXT: s_nop 9 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_nop 10 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: @@ -4581,6 +4581,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -4601,12 +4602,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: 
v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-SDAG-NEXT: s_nop 10 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_nop 11 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index ab0000f6831b6..eefd7b5fea63e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -3182,18 +3182,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] -; VGPRRC-NEXT: s_nop 11 +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 +; VGPRRC-NEXT: s_nop 9 ; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1 ; VGPRRC-NEXT: 
s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 @@ -3214,14 +3212,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8: @@ -3594,18 +3592,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1 -; VGPRRC-NEXT: s_nop 11 +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16 +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0 +; VGPRRC-NEXT: s_nop 9 ; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0 -; 
VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 @@ -3626,14 +3622,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 -; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags: @@ -4146,33 +4142,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 
+; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4256,33 +4251,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; 
HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -4320,33 +4314,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], 
v[36:39], v[16:31] -; VGPRRC-NEXT: s_nop 6 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; 
VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) @@ -4523,33 +4516,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; 
SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4633,33 +4625,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: 
v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -4697,33 +4688,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: s_nop 6 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; 
VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 7e30af96bb8b9..aa670dce4e6f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 -; 
GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -1155,17 +1155,17 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 9 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX942-VGPR-LABEL: 
test_mfma_f32_4x4x4f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], 
s[8:9] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg @@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) ; ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1: ; GFX942-VGPR: ; %bb.0: -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1 +; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v4, 1 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3) @@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) % ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-VGPR-NEXT: 
s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> , i32 0, i32 0, i32 0) @@ -5542,6 +5542,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v1 @@ -5570,39 +5572,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[22:23] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[20:21] -; 
GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[18:19] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[16:17] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[14:15] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[12:13] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[10:11] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[8:9] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[62:63], v[30:31] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v64, 2.0 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[60:61], v[28:29] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[58:59], v[26:27] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[56:57], v[24:25] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[54:55], v[22:23] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[52:53], v[20:21] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[50:51], v[18:19] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[48:49], v[16:17] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[46:47], v[14:15] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[44:45], v[12:13] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[42:43], v[10:11] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[40:41], v[8:9] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[38:39], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[36:37], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[34:35], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[0:1] ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33] +; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80 -; GFX942-VGPR-NEXT: 
global_store_dwordx4 v1, v[18:21], s[0:1] offset:64 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[60:63], s[0:1] offset:112 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[56:59], s[0:1] offset:96 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[52:55], s[0:1] offset:80 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[48:51], s[0:1] offset:64 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[44:47], s[0:1] offset:48 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[40:43], s[0:1] offset:32 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[36:39], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[32:35], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) @@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat: ; GFX942-VGPR: ; %bb.0: ; %bb -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -5804,19 +5804,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code: ; GFX942-VGPR: ; %bb.0: ; %bb -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3] +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index f0205a3a788ed..a8d2f64c3c4d9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -5093,43 +5093,42 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; SDAG-NEXT: s_nop 14 -; 
SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b64_e32 v[36:37], 48 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[38:39], 32 +; SDAG-NEXT: v_mov_b64_e32 v[40:41], 16 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[42:43], 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: 
v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5137,6 +5136,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0 +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16 +; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] @@ -5154,28 +5156,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48 +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off 
sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5190,71 +5197,71 @@ define amdgpu_kernel void 
@test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: v_mov_b32_e32 v32, 42 ; SDAG-NEXT: v_mov_b32_e32 v33, 25 +; SDAG-NEXT: v_mov_b64_e32 v[36:37], 48 +; SDAG-NEXT: v_mov_b64_e32 v[38:39], 32 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_mov_b32_e32 v24, s20 -; SDAG-NEXT: v_mov_b32_e32 v25, s21 -; SDAG-NEXT: v_mov_b32_e32 v26, s22 -; SDAG-NEXT: v_mov_b32_e32 v27, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v28, s24 -; SDAG-NEXT: v_mov_b32_e32 v29, s25 -; SDAG-NEXT: v_mov_b32_e32 v30, s26 -; SDAG-NEXT: v_mov_b32_e32 v31, s27 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], 
v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[40:41], 16 +; SDAG-NEXT: v_mov_b64_e32 v[42:43], 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; 
SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5264,52 +5271,52 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; GISEL-NEXT: v_mov_b32_e32 v32, 25 ; GISEL-NEXT: v_mov_b32_e32 v33, 42 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; 
GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] ; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[18:19] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], 
v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16 +; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32 +; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index 5475fa2ae5c6e..ef3bb0cb5f4f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] +; GFX942-SDAG-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd: @@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0 ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-GISEL-NEXT: s_nop 5 -; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-GISEL-NEXT: s_nop 6 +; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index ee11b9295a24a..9b9d11502d413 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -245,24 +245,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16: @@ -307,24 +324,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: 
v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0: @@ -369,24 +403,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, 
v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1: @@ -672,24 +723,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; 
GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ 
-699,24 +767,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: 
v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result @@ -726,24 +811,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NEXT: v_mov_b32_e32 v3, v15 -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: v_mov_b32_e32 v5, v17 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v20 -; GCN-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NEXT: 
v_mov_b32_e32 v10, v22 -; GCN-NEXT: v_mov_b32_e32 v11, v23 -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: v_mov_b32_e32 v13, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v26 -; GCN-NEXT: v_mov_b32_e32 v15, v27 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result @@ -1042,24 +1144,41 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: 
v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8: @@ -1104,24 +1223,41 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: 
v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0: @@ -1166,24 +1302,41 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, 
<8 x i32 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, 
a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1: @@ -2049,24 +2202,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: 
v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: @@ -2111,24 +2281,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 
-; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: @@ -2173,24 +2360,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, 
v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: @@ -2400,24 +2604,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: 
v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: 
v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: @@ -2462,24 +2683,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: 
v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: @@ -2524,24 +2762,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; 
SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: @@ -2751,24 +3006,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: 
v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: @@ -2813,24 +3085,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; 
SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: @@ -2875,24 +3164,41 @@ define <16 x float> 
@test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; 
SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: @@ -3102,24 +3408,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; 
SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: @@ -3164,24 +3487,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: 
v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: @@ -3226,24 +3566,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 +; SDAG-NEXT: 
v_accvgpr_write_b32 a8, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_mov_b32_e32 v0, v12 -; SDAG-NEXT: v_mov_b32_e32 v1, v13 -; SDAG-NEXT: v_mov_b32_e32 v2, v14 -; SDAG-NEXT: v_mov_b32_e32 v3, v15 -; SDAG-NEXT: v_mov_b32_e32 v4, v16 -; SDAG-NEXT: v_mov_b32_e32 v5, v17 -; SDAG-NEXT: v_mov_b32_e32 v6, v18 -; SDAG-NEXT: v_mov_b32_e32 v7, v19 -; SDAG-NEXT: v_mov_b32_e32 v8, v20 -; SDAG-NEXT: v_mov_b32_e32 v9, v21 -; SDAG-NEXT: v_mov_b32_e32 v10, v22 -; SDAG-NEXT: v_mov_b32_e32 v11, v23 -; SDAG-NEXT: v_mov_b32_e32 v12, v24 -; SDAG-NEXT: v_mov_b32_e32 v13, v25 -; SDAG-NEXT: v_mov_b32_e32 v14, v26 -; SDAG-NEXT: v_mov_b32_e32 v15, v27 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 9a23788f8855a..ac88f8d550f9c 100644 --- 
a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -373,7 +373,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mov_b32_e32 v22, 0x7fc00000 ; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] ; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 @@ -507,13 +507,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5] +; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5] ; CHECK-NEXT: s_nop 3 -; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0] ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v9 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:7] ; CHECK-NEXT: ;;#ASMEND @@ -635,46 +635,14 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a7 -; 
CHECK-NEXT: v_accvgpr_read_b32 v8, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v18, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v20, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] -; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; CHECK-NEXT: global_store_dwordx4 v32, a[24:27], s[2:3] offset:96 +; CHECK-NEXT: global_store_dwordx4 v32, a[28:31], s[2:3] offset:112 +; CHECK-NEXT: global_store_dwordx4 v32, a[16:19], s[2:3] offset:64 +; CHECK-NEXT: global_store_dwordx4 v32, a[20:23], s[2:3] offset:80 +; CHECK-NEXT: global_store_dwordx4 v32, a[8:11], s[2:3] offset:32 +; CHECK-NEXT: global_store_dwordx4 v32, a[12:15], s[2:3] offset:48 +; CHECK-NEXT: global_store_dwordx4 v32, a[0:3], s[2:3] +; CHECK-NEXT: global_store_dwordx4 v32, a[4:7], s[2:3] offset:16 ; CHECK-NEXT: s_endpgm %src2 = call <32 x float> asm sideeffect "; def $0", "=a"() 
%mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0) @@ -756,15 +724,15 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_and_b32_e32 v12, 0x3ff, v31 ; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] -; CHECK-NEXT: v_and_b32_e32 v2, 0x3ff, v31 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3] +; CHECK-NEXT: s_nop 3 ; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1] -; CHECK-NEXT: s_nop 8 -; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off +; CHECK-NEXT: v_lshlrev_b32_e32 v4, 3, v12 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5] +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: global_store_dwordx2 v[4:5], a[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %src2 = call double asm sideeffect "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll index a81d9a458e23a..08f89b32edb20 100644 --- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll @@ -101,8 +101,13 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[6:9] +; CHECK-NEXT: ; def v[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:200 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] @@ -112,37 +117,75 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 +; CHECK-NEXT: v_mov_b32_e32 v60, 0 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a56 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a57 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a58 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a59 +; CHECK-NEXT: global_store_dwordx4 v60, v[56:59], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v60, v[52:55], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v60, v[48:51], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v60, v[44:47], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v60, v[40:43], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v60, v[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v60, v[32:35], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a33 +; CHECK-NEXT: 
v_accvgpr_read_b32 v2, a34 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a35 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a36 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a37 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a38 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a39 +; CHECK-NEXT: v_accvgpr_read_b32 v8, a40 +; CHECK-NEXT: v_accvgpr_read_b32 v9, a41 +; CHECK-NEXT: v_accvgpr_read_b32 v10, a42 +; CHECK-NEXT: v_accvgpr_read_b32 v11, a43 +; CHECK-NEXT: v_accvgpr_read_b32 v12, a44 +; CHECK-NEXT: v_accvgpr_read_b32 v13, a45 +; CHECK-NEXT: v_accvgpr_read_b32 v14, a46 +; CHECK-NEXT: v_accvgpr_read_b32 v15, a47 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a48 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a49 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a50 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a51 +; CHECK-NEXT: v_accvgpr_read_b32 v20, a52 +; CHECK-NEXT: v_accvgpr_read_b32 v21, a53 +; CHECK-NEXT: v_accvgpr_read_b32 v22, a54 +; CHECK-NEXT: v_accvgpr_read_b32 v23, a55 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a60 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a61 +; CHECK-NEXT: v_accvgpr_read_b32 v30, a62 +; CHECK-NEXT: v_accvgpr_read_b32 v31, a63 +; CHECK-NEXT: global_store_dwordx4 v60, v[24:27], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112 +; CHECK-NEXT: global_store_dwordx4 v60, v[28:31], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v60, v[16:19], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v60, v[20:23], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v60, v[8:11], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v60, 
v[12:15], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v60, v[4:7], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -301,16 +344,26 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 ; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 ; CHECK-NEXT: v_accvgpr_read_b32 v7, a3 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_accvgpr_read_b32 v6, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[8:11] +; CHECK-NEXT: ; def v[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[12:15] 
+; CHECK-NEXT: ; def v[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND @@ -319,39 +372,82 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 +; CHECK-NEXT: v_mov_b32_e32 v60, 0 +; CHECK-NEXT: v_accvgpr_read_b32 v24, a56 +; CHECK-NEXT: v_accvgpr_read_b32 v25, a57 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a58 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a59 +; CHECK-NEXT: global_store_dwordx4 v60, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v60, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v60, v[48:51], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v60, v[44:47], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64 +; CHECK-NEXT: global_store_dwordx4 v60, v[40:43], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v60, v[36:39], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32 +; CHECK-NEXT: 
global_store_dwordx4 v60, v[32:35], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a33 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a34 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a35 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a36 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a37 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a38 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a39 +; CHECK-NEXT: v_accvgpr_read_b32 v8, a40 +; CHECK-NEXT: v_accvgpr_read_b32 v9, a41 +; CHECK-NEXT: v_accvgpr_read_b32 v10, a42 +; CHECK-NEXT: v_accvgpr_read_b32 v11, a43 +; CHECK-NEXT: v_accvgpr_read_b32 v12, a44 +; CHECK-NEXT: v_accvgpr_read_b32 v13, a45 +; CHECK-NEXT: v_accvgpr_read_b32 v14, a46 +; CHECK-NEXT: v_accvgpr_read_b32 v15, a47 +; CHECK-NEXT: v_accvgpr_read_b32 v16, a48 +; CHECK-NEXT: v_accvgpr_read_b32 v17, a49 +; CHECK-NEXT: v_accvgpr_read_b32 v18, a50 +; CHECK-NEXT: v_accvgpr_read_b32 v19, a51 +; CHECK-NEXT: v_accvgpr_read_b32 v20, a52 +; CHECK-NEXT: v_accvgpr_read_b32 v21, a53 +; CHECK-NEXT: v_accvgpr_read_b32 v22, a54 +; CHECK-NEXT: v_accvgpr_read_b32 v23, a55 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a60 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a61 +; CHECK-NEXT: v_accvgpr_read_b32 v30, a62 +; CHECK-NEXT: v_accvgpr_read_b32 v31, a63 +; CHECK-NEXT: global_store_dwordx4 v60, v[24:27], s[16:17] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v60, v[28:31], s[16:17] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96 +; CHECK-NEXT: global_store_dwordx4 v60, v[16:19], s[16:17] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112 +; CHECK-NEXT: global_store_dwordx4 v60, v[20:23], s[16:17] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64 +; CHECK-NEXT: 
global_store_dwordx4 v60, v[8:11], s[16:17] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80 +; CHECK-NEXT: global_store_dwordx4 v60, v[12:15], s[16:17] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32 +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48 +; CHECK-NEXT: global_store_dwordx4 v60, v[4:7], s[16:17] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17] +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; CHECK-NEXT: global_store_dwordx4 v60, v[0:3], s[16:17] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload From 19d711992615cfe578c90b001bb22c6d497e7029 Mon 
Sep 17 00:00:00 2001 From: mssefat Date: Sun, 21 Sep 2025 02:25:04 -0400 Subject: [PATCH 09/20] Updated mir test --- ...amdgcn.mfma.hint.hazard.barrier.gfx942.mir | 1443 +++-------------- 1 file changed, 195 insertions(+), 1248 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir index 271b36fad2bb4..97305f2c8a8f0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=GFX942_WITHOUT %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=true %s -o - | FileCheck -check-prefix=GFX942_WITH %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=CHECK-NO-ANTIHINT %s --- | target triple = "amdgcn-amd-amdhsa" @@ -17,855 +17,153 @@ name: test_software_pipelining body: | bb.0: - ; GFX942_WITHOUT-LABEL: name: test_software_pipelining - ; GFX942_WITHOUT: renamable $vgpr115 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr109 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr110 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr76 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr108 = IMPLICIT_DEF - ;
GFX942_WITHOUT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr100 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr111 = V_ADD_U32_e32 4096, $vgpr100, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr101 = V_ADD_U32_e32 $vgpr76, killed $vgpr52, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr112 = V_ADD_U32_e32 4096, $vgpr101, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr112, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 
(s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 4096, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 renamable $vgpr108, 6144, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, 
$vgpr82_vgpr83, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr80_vgpr81, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr76, killed $vgpr0, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr82_vgpr83, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr92_vgpr93, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr94_vgpr95, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr108, 8192, 0, implicit $exec :: (load (s128), addrspace 3) - ; 
GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr80_vgpr81, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr82_vgpr83, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr108, 10240, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr80_vgpr81, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr72_vgpr73_vgpr74_vgpr75, 16384, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr82_vgpr83, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: 
renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr92_vgpr93, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr94_vgpr95, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr108, 12288, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr80_vgpr81, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 20480, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr108, 14336, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr92_vgpr93, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr94_vgpr95, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr110, 0, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, killed $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr92_vgpr93, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, killed $vgpr94_vgpr95, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr120 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr104_vgpr105, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr106_vgpr107, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr104_vgpr105, killed 
$vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr114 = V_ADD_U32_e32 $vgpr115, killed $vgpr16, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; 
GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr104_vgpr105, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr113 = V_ADD_U32_e32 $vgpr115, killed $vgpr20, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr106_vgpr107, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr102_vgpr103, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr104_vgpr105, 
killed $vgpr96_vgpr97_vgpr98_vgpr99, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr78_vgpr79, $vgpr106_vgpr107, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr78_vgpr79, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 28672, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, 
$vgpr100_vgpr101, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr118_vgpr119_vgpr120_vgpr121 = DS_READ_B128_gfx9 killed renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr104_vgpr105, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr56 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr116 = V_ADD_U32_e32 $vgpr115, killed $vgpr56, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr106_vgpr107, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr100_vgpr101, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr72 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr115 = V_ADD_U32_e32 killed $vgpr115, killed $vgpr72, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr102_vgpr103, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable 
$vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr115, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279 - ; GFX942_WITHOUT-NEXT: S_BARRIER - ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 18432, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr104_vgpr105, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr120_vgpr121, killed $vgpr106_vgpr107, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr120_vgpr121, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 16384, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 
1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; 
GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; 
GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0 - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = DS_READ_B128_gfx9 renamable $vgpr108, 20480, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 22528, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable 
$vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr106_vgpr107, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 24576, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable 
$vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr104_vgpr105_vgpr106_vgpr107 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr112, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 26624, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 28672, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr108, 30720, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec - ; 
GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr2_vgpr3, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 killed renamable $vgpr110, 16384, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec - ; 
GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr92 = IMPLICIT_DEF - ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = DS_READ_B128_gfx9 renamable $vgpr92, 18432, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr92, 20480, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 renamable $vgpr92, 22528, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr82_vgpr83, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr82_vgpr83, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr92, 24576, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr115, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, $vgpr10_vgpr11, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr92, 26624, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, 
$vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr92, 28672, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr109, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr10_vgpr11, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr12_vgpr13, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr14_vgpr15, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 killed renamable $vgpr92, 30720, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr8_vgpr9, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable 
$vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr10_vgpr11, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279 - ; GFX942_WITHOUT-NEXT: S_BARRIER - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr108, 2048, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = DS_READ_B128_gfx9 killed renamable $vgpr108, 0, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; 
GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; 
GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; 
GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0 - ; GFX942_WITHOUT-NEXT: S_ENDPGM 0 + ; CHECK-LABEL: name: test_software_pipelining + ; CHECK: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr72 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr68 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr73 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; CHECK-NEXT: renamable 
$vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr74 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr56 = V_ADD_U32_e32 4096, $vgpr74, implicit $exec + ; CHECK-NEXT: renamable $vgpr75 = V_ADD_U32_e32 $vgpr68, killed $vgpr52, implicit $exec + ; CHECK-NEXT: renamable $vgpr52 = V_ADD_U32_e32 4096, $vgpr75, implicit $exec + ; CHECK-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr52, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr56, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, 
implicit $exec + ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr56_vgpr57, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr73, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr46_vgpr47, $vgpr58_vgpr59, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr52_vgpr53, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr46_vgpr47, $vgpr54_vgpr55, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr69 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr68 = V_ADD_U32_e32 killed $vgpr68, killed $vgpr69, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr68_vgpr69_vgpr70_vgpr71 = BUFFER_LOAD_DWORDX4_OFFEN 
renamable $vgpr74, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr68_vgpr69_vgpr70_vgpr71 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr75, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed 
$vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr73, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF + ; CHECK-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr72, killed renamable $vgpr68_vgpr69_vgpr70_vgpr71, 16384, 0, implicit $exec :: (store (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF + ; CHECK-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr72, killed renamable $vgpr48_vgpr49_vgpr50_vgpr51, 20480, 0, implicit $exec :: (store (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, killed $vgpr54_vgpr55, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 killed renamable $vgpr73, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr64_vgpr65, killed $vgpr56_vgpr57, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr74, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; - ; GFX942_WITH-LABEL: name: test_software_pipelining - ; GFX942_WITH: renamable $vgpr96 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr121 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr122 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr120 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable 
$vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr123 = V_ADD_U32_e32 4096, $vgpr97, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr102 = V_ADD_U32_e32 $vgpr52, killed $vgpr0, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr124 = V_ADD_U32_e32 4096, $vgpr102, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr124, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, 
implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable 
$vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr52, killed $vgpr0, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr6_vgpr7, $vgpr82_vgpr83, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr92_vgpr93, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr6_vgpr7, $vgpr94_vgpr95, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed 
$vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 16384, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: 
renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 20480, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable 
$vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr122, 0, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr80_vgpr81, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, killed $vgpr82_vgpr83, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, killed $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr97, 2048, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = 
IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr112_vgpr113, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr97, 4096, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr112_vgpr113, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 6144, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable 
$vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr112_vgpr113, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr126 = V_ADD_U32_e32 $vgpr96, killed $vgpr16, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr100_vgpr101, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 8192, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr104_vgpr105_vgpr106_vgpr107, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr125 = V_ADD_U32_e32 $vgpr96, killed $vgpr20, implicit 
$exec - ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 10240, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr116_vgpr117_vgpr118_vgpr119, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: 
renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 12288, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr108_vgpr109_vgpr110_vgpr111, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr104_vgpr105_vgpr106_vgpr107, 28672, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 killed renamable $vgpr97, 14336, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr56 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr104 = 
V_ADD_U32_e32 $vgpr96, killed $vgpr56, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr60 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr127 = V_ADD_U32_e32 killed $vgpr96, killed $vgpr60, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr127, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: S_WAITCNT 49279 - ; GFX942_WITH-NEXT: S_BARRIER - ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 18432, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr112_vgpr113, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, killed $vgpr114_vgpr115, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, 
implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 16384, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: 
SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - 
; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_BARRIER 0 - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 20480, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable 
$vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 22528, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 24576, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr124, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 
26624, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 28672, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, 
killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 30720, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 killed renamable $vgpr122, 16384, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead 
renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr105 = IMPLICIT_DEF - ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr105, 18432, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 renamable $vgpr105, 20480, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = 
contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr105, 22528, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr84_vgpr85_vgpr86_vgpr87 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr105, 24576, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr88_vgpr89_vgpr90_vgpr91 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr127, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr105, 26624, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed 
$vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr105, 28672, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr121, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3) - ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 killed renamable $vgpr105, 30720, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr8_vgpr9, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr12_vgpr13, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: S_WAITCNT 49279 - ; GFX942_WITH-NEXT: S_BARRIER - ; GFX942_WITH-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, 
$vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - ; GFX942_WITH-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = DS_READ_B128_gfx9 killed renamable $vgpr120, 0, 0, implicit $exec :: (load (s128), addrspace 3) - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - 
; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: 
SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0 - ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0 - ; GFX942_WITH-NEXT: SCHED_BARRIER 0 - ; GFX942_WITH-NEXT: S_ENDPGM 0 + ; CHECK-NO-ANTIHINT-LABEL: name: test_software_pipelining + ; CHECK-NO-ANTIHINT: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr68 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr69 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr70 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: 
dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr71 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr56 = V_ADD_U32_e32 4096, $vgpr71, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr72 = V_ADD_U32_e32 $vgpr69, killed $vgpr52, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52 = V_ADD_U32_e32 4096, $vgpr72, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr52, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr56, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NO-ANTIHINT-NEXT: renamable 
$vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr70, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr56_vgpr57, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr70, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr46_vgpr47, $vgpr58_vgpr59, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr52_vgpr53, killed 
$vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr46_vgpr47, $vgpr54_vgpr55, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36 = V_ADD_U32_e32 killed $vgpr69, killed $vgpr36, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr71, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr70, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr56_vgpr57, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, 
implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr72, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr58_vgpr59, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr52_vgpr53, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr54_vgpr55, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr70, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr56_vgpr57, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr68, killed renamable $vgpr24_vgpr25_vgpr26_vgpr27, 16384, 0, implicit $exec :: (store (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr58_vgpr59, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr52_vgpr53, killed 
$vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr54_vgpr55, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = DS_READ_B128_gfx9 renamable $vgpr70, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr56_vgpr57, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr14_vgpr15_vgpr16_vgpr17 = IMPLICIT_DEF + ; CHECK-NO-ANTIHINT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr68, killed renamable $vgpr14_vgpr15_vgpr16_vgpr17, 20480, 0, implicit $exec :: (store (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr58_vgpr59, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr52_vgpr53, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr54_vgpr55, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 killed renamable $vgpr70, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr12_vgpr13, killed $vgpr56_vgpr57, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 
0, 0, implicit $mode, implicit $exec + ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr71, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF @@ -890,403 +188,52 @@ body: | %21:vreg_128_align2 = IMPLICIT_DEF %22:vreg_128_align2 = IMPLICIT_DEF %23:vreg_128_align2 = IMPLICIT_DEF - %25:vgpr_32 = IMPLICIT_DEF - %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec - %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec - %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec - %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %31:vreg_128_align2 = IMPLICIT_DEF - %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3) - %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %37:vreg_128_align2 = 
IMPLICIT_DEF - %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3) - %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %24:vgpr_32 = IMPLICIT_DEF + %25:vgpr_32 = V_ADD_U32_e32 4096, %24, implicit $exec + %26:vgpr_32 = V_ADD_U32_e32 %3, %7, implicit $exec + %27:vgpr_32 = V_ADD_U32_e32 4096, %26, implicit $exec + %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %30:vreg_128_align2 = IMPLICIT_DEF + %31:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub0_sub1, %29.sub0_sub1, %23, 0, 0, 0, implicit $mode, implicit $exec + %32:vreg_128_align2 = DS_READ_B128_gfx9 %4, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub2_sub3, %29.sub2_sub3, %31, 0, 0, 0, implicit $mode, implicit 
$exec + %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub0_sub1, %28.sub0_sub1, %22, 0, 0, 0, implicit $mode, implicit $exec + %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub2_sub3, %28.sub2_sub3, %34, 0, 0, 0, implicit $mode, implicit $exec + %36:vreg_128_align2 = IMPLICIT_DEF + %37:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub0_sub1, %29.sub0_sub1, %21, 0, 0, 0, implicit $mode, implicit $exec + %38:vreg_128_align2 = DS_READ_B128_gfx9 %4, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub2_sub3, %29.sub2_sub3, %37, 0, 0, 0, implicit $mode, implicit $exec + %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub0_sub1, %28.sub0_sub1, %20, 0, 0, 0, implicit $mode, implicit $exec + %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub2_sub3, %28.sub2_sub3, %40, 0, 0, 0, implicit $mode, implicit $exec + %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1, %29.sub0_sub1, %19, 0, 0, 0, implicit $mode, implicit $exec %43:vgpr_32 = IMPLICIT_DEF - %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec - %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %45:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %42:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %18:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %46:vreg_128_align2, 0, 0, 0, implicit 
$mode, implicit $exec - %48:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3) - %49:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %17:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %50:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %51:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %49:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %16:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %52:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %54:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3) - %55:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %15:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %56:vreg_128_align2 = IMPLICIT_DEF - DS_WRITE_B128_gfx9 %1:vgpr_32, %56:vreg_128_align2, 16384, 0, implicit $exec :: (store (s128), addrspace 3) - %57:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %55:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %14:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
%48.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %58:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %60:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3) - %61:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %13:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %62:vreg_128_align2 = IMPLICIT_DEF - DS_WRITE_B128_gfx9 %1:vgpr_32, %62:vreg_128_align2, 20480, 0, implicit $exec :: (store (s128), addrspace 3) - %63:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %61:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %12:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %64:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %66:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3) - %67:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %11:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %68:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %69:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %67:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %70:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %10:vreg_128_align2, 0, 0, 0, implicit $mode, 
implicit $exec - %71:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %70:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %72:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3) - %73:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %9:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %74:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %75:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %73:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %76:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %8:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %77:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %76:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %79:vgpr_32 = IMPLICIT_DEF - %78:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3) - %81:vreg_128_align2 = IMPLICIT_DEF - %80:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %33:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %82:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3) - %83:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %80:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %85:vreg_128_align2 = IMPLICIT_DEF - %84:vreg_128_align2 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %35:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %86:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %84:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %87:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %39:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %88:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3) - %89:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %87:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %90:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %41:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %91:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %90:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %92:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %45:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %94:vgpr_32 = IMPLICIT_DEF - %93:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %94:vgpr_32, implicit $exec - %95:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %96:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %92:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %97:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, 
%85.sub0_sub1:vreg_128_align2, %47:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %98:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %97:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %99:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3) - %100:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %51:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %102:vgpr_32 = IMPLICIT_DEF - %101:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %102:vgpr_32, implicit $exec - %103:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %104:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %100:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %105:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %53:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %106:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %105:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %107:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3) - %108:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %57:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %109:vreg_128_align2 = IMPLICIT_DEF - DS_WRITE_B128_gfx9 %1:vgpr_32, %109:vreg_128_align2, 24576, 0, implicit $exec :: (store (s128), addrspace 3) - %110:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
%99.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %108:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %111:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %59:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %112:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %111:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %113:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3) - %114:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %63:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %115:vreg_128_align2 = IMPLICIT_DEF - DS_WRITE_B128_gfx9 %1:vgpr_32, %115:vreg_128_align2, 28672, 0, implicit $exec :: (store (s128), addrspace 3) - %116:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %114:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %117:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %65:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %118:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %117:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %119:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3) - %120:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %69:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %122:vgpr_32 = IMPLICIT_DEF - %121:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %122:vgpr_32, implicit $exec - %123:vreg_128_align2 
= BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %124:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %120:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %125:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %71:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %127:vgpr_32 = IMPLICIT_DEF - %126:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %127:vgpr_32, implicit $exec - %128:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %125:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %129:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - S_WAITCNT 49279 - S_BARRIER - %130:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3) - %131:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %75:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %132:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %131:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %133:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %77:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %134:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %133:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %135:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 16384, 0, implicit 
$exec :: (load (s128), addrspace 3) - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 512, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 512, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - 
SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 512, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 512, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_BARRIER 0 - %136:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %83:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %137:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3) - %138:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %136:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %139:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %86:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %140:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %139:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %141:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %89:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %142:vreg_128_align2 = 
DS_READ_B128_gfx9 %4:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3) - %143:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %141:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %144:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %91:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %145:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %144:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %146:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %96:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %147:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %146:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %148:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %98:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %149:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %148:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %150:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3) - %151:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %104:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %152:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %151:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %153:vreg_128_align2 = 
contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %106:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %154:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %153:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %155:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3) - %156:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %110:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - DS_WRITE_B128_gfx9 %1:vgpr_32, %95:vreg_128_align2, 0, 0, implicit $exec :: (store (s128), addrspace 3) - %157:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %156:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %158:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %112:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %159:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %158:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %160:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3) - %161:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %116:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - DS_WRITE_B128_gfx9 %1:vgpr_32, %103:vreg_128_align2, 4096, 0, implicit $exec :: (store (s128), addrspace 3) - %162:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %161:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - 
%163:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %118:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %164:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %163:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %165:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3) - %166:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %124:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %981:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %167:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %166:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %168:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %128:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %169:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %168:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %170:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3) - %171:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %132:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %985:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %172:vreg_128_align2 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %171:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %173:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %134:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %174:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %173:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %176:vgpr_32 = IMPLICIT_DEF - %175:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3) - %177:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %138:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %178:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3) - %179:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %177:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %180:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %140:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %962:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %180:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %182:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %143:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %183:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3) - %961:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 
%175.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %182:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %185:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %145:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %960:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %185:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %187:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %147:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %956:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %959:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %187:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %189:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %149:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %958:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %189:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %191:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3) - %192:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %152:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %962:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %957:vreg_128_align2 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %192:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %194:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %154:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %956:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %194:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %196:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3) - %197:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %157:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - DS_WRITE_B128_gfx9 %1:vgpr_32, %123:vreg_128_align2, 8192, 0, implicit $exec :: (store (s128), addrspace 3) - %955:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %197:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %199:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %159:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %954:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %199:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %201:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3) - %202:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %162:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - DS_WRITE_B128_gfx9 %1:vgpr_32, %129:vreg_128_align2, 12288, 0, implicit $exec :: (store (s128), addrspace 3) - 
%953:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %202:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %204:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %164:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %952:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %204:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %206:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3) - %207:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %167:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %910:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %951:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %207:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %209:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %169:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %950:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %209:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %911:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - S_WAITCNT 49279 - S_BARRIER - %937:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3) - %211:vreg_128_align2 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %172:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %949:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %211:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %213:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %174:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %948:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %213:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %931:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3) - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 512, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 512, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - 
SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 512, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 512, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 32, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 8, 1, 0 - SCHED_GROUP_BARRIER 256, 1, 0 - SCHED_BARRIER 0 - S_ENDPGM 0 + %44:vgpr_32 = V_ADD_U32_e32 %3, %43, implicit $exec + %45:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24, %6, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3, 
%29.sub2_sub3, %42, 0, 0, 0, implicit $mode, implicit $exec + %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1, %28.sub0_sub1, %18, 0, 0, 0, implicit $mode, implicit $exec + %48:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3, %28.sub2_sub3, %47, 0, 0, 0, implicit $mode, implicit $exec + %49:vreg_128_align2 = DS_READ_B128_gfx9 %4, 8192, 0, implicit $exec :: (load (s128), addrspace 3) + %50:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1, %29.sub0_sub1, %17, 0, 0, 0, implicit $mode, implicit $exec + %51:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26, %6, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3, %29.sub2_sub3, %50, 0, 0, 0, implicit $mode, implicit $exec + %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1, %28.sub0_sub1, %16, 0, 0, 0, implicit $mode, implicit $exec + %54:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3, %28.sub2_sub3, %53, 0, 0, 0, implicit $mode, implicit $exec + %55:vreg_128_align2 = DS_READ_B128_gfx9 %4, 10240, 0, implicit $exec :: (load (s128), addrspace 3) + %56:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub0_sub1, %29.sub0_sub1, %15, 0, 0, 0, implicit $mode, implicit $exec + %57:vreg_128_align2 = IMPLICIT_DEF + DS_WRITE_B128_gfx9 %1, %57, 16384, 0, implicit $exec :: (store (s128), addrspace 3) + %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub2_sub3, %29.sub2_sub3, %56, 0, 0, 0, implicit $mode, implicit $exec + %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub0_sub1, %28.sub0_sub1, %14, 0, 0, 0, implicit $mode, implicit $exec + %60:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub2_sub3, %28.sub2_sub3, %59, 0, 0, 0, implicit $mode, implicit $exec + 
%61:vreg_128_align2 = DS_READ_B128_gfx9 %4, 12288, 0, implicit $exec :: (load (s128), addrspace 3) + %62:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub0_sub1, %29.sub0_sub1, %13, 0, 0, 0, implicit $mode, implicit $exec + %63:vreg_128_align2 = IMPLICIT_DEF + DS_WRITE_B128_gfx9 %1, %63, 20480, 0, implicit $exec :: (store (s128), addrspace 3) + %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub2_sub3, %29.sub2_sub3, %62, 0, 0, 0, implicit $mode, implicit $exec + %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub0_sub1, %28.sub0_sub1, %12, 0, 0, 0, implicit $mode, implicit $exec + %66:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub2_sub3, %28.sub2_sub3, %65, 0, 0, 0, implicit $mode, implicit $exec + %67:vreg_128_align2 = DS_READ_B128_gfx9 %4, 14336, 0, implicit $exec :: (load (s128), addrspace 3) + %68:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %61.sub0_sub1, %29.sub0_sub1, %11, 0, 0, 0, implicit $mode, implicit $exec + %69:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24, %6, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ... 
From cd0f88dde7ea41bec87df3e093e1df3e0bf9ff22 Mon Sep 17 00:00:00 2001 From: mssefat Date: Sun, 21 Sep 2025 02:33:00 -0400 Subject: [PATCH 10/20] Renamed test file --- ....barrier.gfx942.mir => llvm.amdgcn.mfma.anti-hints.gfx942.mir} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llvm/test/CodeGen/AMDGPU/{llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir => llvm.amdgcn.mfma.anti-hints.gfx942.mir} (100%) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir rename to llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir From 7a8d17e95ab02a7c415143269ca0238f82b37432 Mon Sep 17 00:00:00 2001 From: mssefat Date: Mon, 22 Sep 2025 14:42:41 -0400 Subject: [PATCH 11/20] Added print and parse tests --- ...vm.amdgcn.mfma.anti-hints-parse.gfx942.mir | 195 ++++++++++++++++++ ...vm.amdgcn.mfma.anti-hints-print.gfx942.mir | 126 +++++++++++ .../llvm.amdgcn.mfma.anti-hints.gfx942.mir | 4 +- 3 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir new file mode 100644 index 0000000000000..905fff8b642cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir @@ -0,0 +1,195 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -debug -run-pass=greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s +--- | + ; ModuleID = '/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir' + source_filename = 
"/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir" + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" + target triple = "amdgcn-amd-amdhsa" + + ; Function Attrs: nounwind + define amdgpu_kernel void @test_software_pipelining() #0 { + bb.0: + ret void + } + + attributes #0 = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-waves-per-eu"="2" "frame-pointer"="none" "target-cpu"="gfx942" } +... +--- +name: test_software_pipelining +registers: + - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%27', + '%4', + '%26', + '%25', + '%5', + '%24', + '%22', + '%6', + '%20', + '%19', + '%7', + '%18', + '%16', + '%8' ] } + - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%16', + '%8', + '%22', + '%6', + '%20', + '%19', + '%7', + '%18' ] } + - { id: 2, class: sgpr_128, preferred-register: '', flags: [ ] } + - { id: 3, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } + - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } + - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } + - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } + - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%17', '%1', '%23', '%29', '%0', '%28', '%30', '%9' ] } + - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%27', + '%4', + '%26', + '%25', + '%5', + 
'%24', + '%22', + '%6', + '%20', + '%19', + '%7', + '%18', + '%16', + '%8' ] } + - { id: 10, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 11, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 12, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 13, class: vreg_128_align2, preferred-register: '', flags: [ ] } + - { id: 14, class: vreg_128_align2, preferred-register: '', flags: [ ] } + - { id: 15, class: vreg_128_align2, preferred-register: '', flags: [ ] } + - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } + - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%16', '%8' ] } + - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } + - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } + - { id: 20, class: vreg_128_align2, preferred-register: '', flags: [ ] } + - { id: 21, class: vreg_128_align2, preferred-register: '', flags: [ ] } + - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } + - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%22', '%6', '%20', '%19', '%7', '%18', '%16', '%8' ] } + - { id: 24, class: vreg_128_align2, preferred-register: '', flags: [ ] } + - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } + - { id: 26, class: vreg_128_align2, preferred-register: '', flags: [ ] } + - { id: 27, class: vreg_128_align2, preferred-register: '', flags: [ ] } + - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%27', + '%4', + '%26', + '%25', + '%5', + '%24', + '%22', + '%6', + '%20', + '%19', + '%7', + '%18', + '%16', + 
'%8' ] } + - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%27', + '%4', + '%26', + '%25', + '%5', + '%24', + '%22', + '%6', + '%20', + '%19', + '%7', + '%18', + '%16', + '%8' ] } + - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%27', '%4', '%26', '%25', '%5', '%24', '%22', '%6', + '%20', '%19', '%7', '%18', '%16', '%8' ] } +body: | + bb.0: + ; CHECK-LABEL: name: test_software_pipelining + ; CHECK: renamable $vgpr36 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr37 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr38 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr24 = V_ADD_U32_e32 4096, $vgpr38, implicit $exec + ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 $vgpr36, killed $vgpr20, implicit $exec + ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 4096, killed $vgpr20, implicit $exec + ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr20, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr24, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr24_vgpr25, killed 
$vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128_gfx9 renamable $vgpr37, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr26_vgpr27, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr20_vgpr21, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr22_vgpr23, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr24_vgpr25, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 killed renamable $vgpr37, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr26_vgpr27, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr20_vgpr21, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, killed $vgpr22_vgpr23, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = 
contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr32_vgpr33, killed $vgpr24_vgpr25, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr20 = V_ADD_U32_e32 killed $vgpr36, killed $vgpr20, implicit $exec + ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr38, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr28_vgpr29_vgpr30_vgpr31, implicit killed renamable $vgpr8_vgpr9_vgpr10_vgpr11, implicit killed renamable $vgpr12_vgpr13_vgpr14_vgpr15, implicit killed renamable $vgpr4_vgpr5_vgpr6_vgpr7, implicit killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed renamable $vgpr20_vgpr21_vgpr22_vgpr23 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sgpr_128 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vreg_128_align2 = IMPLICIT_DEF + %5:vreg_128_align2 = IMPLICIT_DEF + %6:vreg_128_align2 = IMPLICIT_DEF + %7:vreg_128_align2 = IMPLICIT_DEF + %8:vreg_128_align2 = IMPLICIT_DEF + %9:vgpr_32 = IMPLICIT_DEF + %10:vgpr_32 = V_ADD_U32_e32 4096, %9, implicit $exec + %11:vgpr_32 = V_ADD_U32_e32 %0, %3, implicit $exec + %12:vgpr_32 = V_ADD_U32_e32 4096, %11, implicit $exec + %13:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %12, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %14:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %10, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %15:vreg_128_align2 = IMPLICIT_DEF + %16:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %14.sub0_sub1, %8, 0, 0, 0, implicit $mode, implicit $exec + %17:vreg_128_align2 = DS_READ_B128_gfx9 %1, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + dead %18:vreg_128_align2 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %14.sub2_sub3, %16, 0, 0, 0, implicit $mode, implicit $exec + %19:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %13.sub0_sub1, %7, 0, 0, 0, implicit $mode, implicit $exec + %20:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %13.sub2_sub3, %19, 0, 0, 0, implicit $mode, implicit $exec + %21:vreg_128_align2 = IMPLICIT_DEF + %22:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %14.sub0_sub1, %6, 0, 0, 0, implicit $mode, implicit $exec + %23:vreg_128_align2 = DS_READ_B128_gfx9 %1, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + %24:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %14.sub2_sub3, %22, 0, 0, 0, implicit $mode, implicit $exec + %25:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %13.sub0_sub1, %5, 0, 0, 0, implicit $mode, implicit $exec + %26:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %13.sub2_sub3, %25, 0, 0, 0, implicit $mode, implicit $exec + %27:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %17.sub0_sub1, %14.sub0_sub1, %4, 0, 0, 0, implicit $mode, implicit $exec + %28:vgpr_32 = IMPLICIT_DEF + dead %29:vgpr_32 = V_ADD_U32_e32 %0, %28, implicit $exec + %30:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %9, %2, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + S_ENDPGM 0, implicit %23, implicit %24, implicit %20, implicit %26, implicit %27, implicit %30 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir new file mode 100644 index 0000000000000..d55dbb4ea0e5f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir @@ -0,0 +1,126 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck -check-prefix=CHECK %s + +--- | + target triple = "amdgcn-amd-amdhsa" + + define amdgpu_kernel void @test_software_pipelining() #0 { + bb.0: + ret void + } + + attributes #0 = {nounwind "amdgpu-waves-per-eu"="2" "amdgpu-agpr-alloc"="0" "frame-pointer"="none"} + +... +--- +name: test_software_pipelining +body: | + bb.0: + ; CHECK-LABEL: name: test_software_pipelining + ; CHECK: registers: + ; CHECK: - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + ; CHECK-NEXT:{{\s*}}'%27', + ; CHECK:{{\s*}}'%8' ] } + ; CHECK: - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + ; CHECK-NEXT:{{\s*}}'%16', + ; CHECK:{{\s*}}'%18' ] } + ; CHECK: - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } + ; CHECK: - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } + ; CHECK: - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } + ; CHECK: - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } + ; CHECK: - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%17'{{.*}}'%9' ] } + ; CHECK: - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + ; CHECK-NEXT:{{\s*}}'%27', + ; CHECK:{{\s*}}'%8' ] } + ; CHECK: - { id: 16, class: vreg_128_align2, 
preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } + ; CHECK: - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%16'{{.*}}'%8' ] } + ; CHECK: - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } + ; CHECK: - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } + ; CHECK: - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } + ; CHECK: - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%22'{{.*}}'%8' ] } + ; CHECK: - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [ ], + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } + ; CHECK: - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + ; CHECK-NEXT:{{\s*}}'%27', + ; CHECK:{{\s*}}'%8' ] } + ; CHECK: - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + ; CHECK-NEXT:{{\s*}}'%27', + ; CHECK:{{\s*}}'%8' ] } + ; CHECK: - { id: 30, class: vreg_128_align2 + ; CHECK-NEXT: {{.*}}anti-hints: [ '%27' + ; CHECK: {{.*}}'%8' ] } + ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, 
[[DEF9]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, [[V_ADD_U32_e32_1]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_2]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF8]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 4096, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: dead [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF7]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vreg_128_align2 = 
IMPLICIT_DEF + ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF6]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 6144, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF5]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF4]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF12]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF9]], [[DEF2]], 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[DS_READ_B128_gfx9_1]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5]], implicit 
[[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8]], implicit [[BUFFER_LOAD_DWORDX4_OFFEN2]] + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = IMPLICIT_DEF + %6:sgpr_128 = IMPLICIT_DEF + %7:vgpr_32 = IMPLICIT_DEF + %19:vreg_128_align2 = IMPLICIT_DEF + %20:vreg_128_align2 = IMPLICIT_DEF + %21:vreg_128_align2 = IMPLICIT_DEF + %22:vreg_128_align2 = IMPLICIT_DEF + %23:vreg_128_align2 = IMPLICIT_DEF + %25:vgpr_32 = IMPLICIT_DEF + %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec + %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec + %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec + %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + %31:vreg_128_align2 = IMPLICIT_DEF + %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3) + %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %37:vreg_128_align2 = IMPLICIT_DEF + 
%36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3) + %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %43:vgpr_32 = IMPLICIT_DEF + %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec + %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + S_ENDPGM 0, implicit %38, implicit %39, implicit %35, implicit %41, implicit %42, implicit %44 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir index 97305f2c8a8f0..d360eccaeb773 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false - %s -o - | FileCheck -check-prefix=CHECK-NO-ANTIHINT %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck %s --check-prefix=CHECK +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT --- | target triple = "amdgcn-amd-amdhsa" From 3a84b9b1d5f8f6f203c6716ebceb68d9c264fee6 Mon Sep 17 00:00:00 2001 From: mssefat Date: Mon, 22 Sep 2025 15:34:06 -0400 Subject: [PATCH 12/20] Fixed typo --- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index f63eea716d68b..e0eecb06e2d32 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -323,7 +323,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { // Check if MFMA register is dead at current instruction const LiveInterval &MFMAInterval = 
LIS->getInterval(MFMAReg); if (!MFMAInterval.liveAt(CurrentSlot)) { - // Add bidirectional antihints + // Add bi-directional anti-hints MRI->addRegAllocationAntiHints(CandidateReg, MFMARegs); MRI->addRegAllocationAntiHints(MFMAReg, CandidateReg); } From b893331d19c7442e7d09da4760f02dabaf61d002 Mon Sep 17 00:00:00 2001 From: mssefat Date: Mon, 22 Sep 2025 18:02:20 -0400 Subject: [PATCH 13/20] Fixed typo --- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index e0eecb06e2d32..098ca1120c85c 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -324,7 +324,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg); if (!MFMAInterval.liveAt(CurrentSlot)) { // Add bi-directional anti-hints - MRI->addRegAllocationAntiHints(CandidateReg, MFMARegs); + MRI->addRegAllocationAntiHints(CandidateReg, MFMAReg); MRI->addRegAllocationAntiHints(MFMAReg, CandidateReg); } } From f04eb481c2af633a0cfb0c0b37769694815fba60 Mon Sep 17 00:00:00 2001 From: mssefat Date: Mon, 22 Sep 2025 18:28:30 -0400 Subject: [PATCH 14/20] Fixed test! 
--- ...lvm.amdgcn.mfma.anti-hints-print.gfx942.mir | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir index d55dbb4ea0e5f..c6de026d447fd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir @@ -18,11 +18,11 @@ body: | ; CHECK-LABEL: name: test_software_pipelining ; CHECK: registers: ; CHECK: - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%27', + ; CHECK-NEXT:{{\s*}}'%4', ; CHECK:{{\s*}}'%8' ] } ; CHECK: - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%16', - ; CHECK:{{\s*}}'%18' ] } + ; CHECK-NEXT:{{\s*}}'%8', + ; CHECK:{{\s*}}'%16' ] } ; CHECK: - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [ ], ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } ; CHECK: - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [ ], @@ -34,12 +34,12 @@ body: | ; CHECK: - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [ ], ; CHECK-NEXT:{{\s*}}anti-hints: [ '%17'{{.*}}'%9' ] } ; CHECK: - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%27', + ; CHECK-NEXT:{{\s*}}'%4', ; CHECK:{{\s*}}'%8' ] } ; CHECK: - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [ ], ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } ; CHECK: - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%16'{{.*}}'%8' ] } + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%8' ] } ; CHECK: - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [ ], ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } ; CHECK: - { id: 19, class: vreg_128_align2, preferred-register: '', 
flags: [ ], @@ -47,17 +47,17 @@ body: | ; CHECK: - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [ ], ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } ; CHECK: - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%22'{{.*}}'%8' ] } + ; CHECK-NEXT:{{\s*}}anti-hints: [ '%6'{{.*}}'%8' ] } ; CHECK: - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [ ], ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } ; CHECK: - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%27', + ; CHECK-NEXT:{{\s*}}'%4', ; CHECK:{{\s*}}'%8' ] } ; CHECK: - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%27', + ; CHECK-NEXT:{{\s*}}'%4', ; CHECK:{{\s*}}'%8' ] } ; CHECK: - { id: 30, class: vreg_128_align2 - ; CHECK-NEXT: {{.*}}anti-hints: [ '%27' + ; CHECK-NEXT: {{.*}}anti-hints: [ '%4' ; CHECK: {{.*}}'%8' ] } ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF From e82a0e2d69d99a5da1d96393bf4ef2c1552fb348 Mon Sep 17 00:00:00 2001 From: mssefat Date: Mon, 22 Sep 2025 18:33:12 -0400 Subject: [PATCH 15/20] Fixed test --- ...vm.amdgcn.mfma.anti-hints-parse.gfx942.mir | 146 ++++++++---------- 1 file changed, 64 insertions(+), 82 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir index 905fff8b642cc..89ac0978a0f72 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir @@ -16,56 +16,46 @@ --- name: test_software_pipelining registers: - - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%27', - '%4', - '%26', - '%25', - '%5', - '%24', - '%22', - '%6', - '%20', - '%19', - '%7', - '%18', - '%16', + - { id: 0, class: 
vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%4', + '%25', + '%5', + '%22', + '%6', + '%19', + '%7', + '%18', + '%16', '%8' ] } - - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%16', - '%8', - '%22', - '%6', - '%20', - '%19', - '%7', - '%18' ] } + - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%8', + '%6', + '%19', + '%7', + '%18', + '%16' ] } - { id: 2, class: sgpr_128, preferred-register: '', flags: [ ] } - { id: 3, class: vgpr_32, preferred-register: '', flags: [ ] } - - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [ ], anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } - - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [ ], anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } - - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [ ], anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [ ], anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [ ], anti-hints: [ '%17', '%1', '%23', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%27', - '%4', - '%26', - '%25', - '%5', - '%24', - '%22', - '%6', - '%20', - '%19', - '%7', - '%18', - '%16', + - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%4', + '%25', + '%5', + '%22', + '%6', + '%19', + '%7', + '%18', + '%16', '%8' ] } - { id: 10, class: vgpr_32, preferred-register: '', flags: 
[ ] } - { id: 11, class: vgpr_32, preferred-register: '', flags: [ ] } @@ -73,58 +63,50 @@ registers: - { id: 13, class: vreg_128_align2, preferred-register: '', flags: [ ] } - { id: 14, class: vreg_128_align2, preferred-register: '', flags: [ ] } - { id: 15, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [ ], anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%16', '%8' ] } - - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%8' ] } + - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [ ], anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [ ], anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - { id: 20, class: vreg_128_align2, preferred-register: '', flags: [ ] } - { id: 21, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [ ], anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } - - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%22', '%6', '%20', '%19', '%7', '%18', '%16', '%8' ] } + - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%6', '%19', '%7', '%18', '%16', '%8' ] } - { id: 24, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [ ], + - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [ 
], anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } - { id: 26, class: vreg_128_align2, preferred-register: '', flags: [ ] } - { id: 27, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%27', - '%4', - '%26', - '%25', - '%5', - '%24', - '%22', - '%6', - '%20', - '%19', - '%7', - '%18', - '%16', + - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%4', + '%25', + '%5', + '%22', + '%6', + '%19', + '%7', + '%18', + '%16', '%8' ] } - - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%27', - '%4', - '%26', - '%25', - '%5', - '%24', - '%22', - '%6', - '%20', - '%19', - '%7', - '%18', - '%16', + - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ + '%4', + '%25', + '%5', + '%22', + '%6', + '%19', + '%7', + '%18', + '%16', '%8' ] } - - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%27', '%4', '%26', '%25', '%5', '%24', '%22', '%6', - '%20', '%19', '%7', '%18', '%16', '%8' ] } + - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [ ], + anti-hints: [ '%4', '%25', '%5', '%22', '%6', '%19', '%7', '%18', + '%16', '%8' ] } body: | bb.0: ; CHECK-LABEL: name: test_software_pipelining From a9b05ec6474dae9634cbdd339f29ce71a5e7ba3d Mon Sep 17 00:00:00 2001 From: mssefat Date: Mon, 22 Sep 2025 19:10:29 -0400 Subject: [PATCH 16/20] Fixed typo --- llvm/lib/CodeGen/AllocationOrder.cpp | 4 ++-- llvm/lib/CodeGen/AllocationOrder.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp index f57df79128c64..f420c96e212d0 100644 --- a/llvm/lib/CodeGen/AllocationOrder.cpp +++ b/llvm/lib/CodeGen/AllocationOrder.cpp @@ -62,7 +62,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM, // Create allocation order object AllocationOrder 
AO(std::move(Hints), Order, HardHints); - // Apply anti-hint filtering if needed + // Apply anti-hints filtering if needed if (!AntiHintedPhysRegs.empty()) { AO.applyAntiHints(AntiHintedPhysRegs, TRI); @@ -103,7 +103,7 @@ void AllocationOrder::applyAntiHints(ArrayRef AntiHintedPhysRegs, } } - // Update Order to point to our filtered storage + // Update Order Order = FilteredOrderStorage; LLVM_DEBUG({ diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h index 842f83d957a6d..029d9c83baf35 100644 --- a/llvm/lib/CodeGen/AllocationOrder.h +++ b/llvm/lib/CodeGen/AllocationOrder.h @@ -120,7 +120,7 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder { return Reg.isPhysical() && is_contained(Hints, Reg.id()); } - /// Apply antihint to the allocation order. + /// Apply anti-hints to the allocation order. void applyAntiHints(ArrayRef AntiHintedPhysRegs, const TargetRegisterInfo *TRI); From c7a41408139fdaa85f7939508f6c8ca5e3c2c4db Mon Sep 17 00:00:00 2001 From: mssefat Date: Mon, 22 Sep 2025 19:31:15 -0400 Subject: [PATCH 17/20] [AMDGPU] Anti-hints in register allocation --- .../include/llvm/CodeGen/MIRParser/MIParser.h | 2 +- llvm/include/llvm/CodeGen/MIRYamlMapping.h | 5 +++-- .../llvm/CodeGen/MachineRegisterInfo.h | 19 +++++++++-------- llvm/lib/CodeGen/AllocationOrder.cpp | 21 +++++++++---------- llvm/lib/CodeGen/AllocationOrder.h | 5 ++--- llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 7 +++---- llvm/lib/CodeGen/MachineRegisterInfo.cpp | 12 +++++------ 7 files changed, 35 insertions(+), 36 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index 1d0a745d5f983..cf7a56587397d 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -45,7 +45,7 @@ struct VRegInfo { } D; Register VReg; Register PreferredReg; - SmallVector AntiHints; // Anti-hints + SmallVector AntiHints; // Anti-hints uint8_t Flags = 0; }; diff --git 
a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 20cc3c370dc66..9c0056fc03376 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -210,9 +210,10 @@ template <> struct MappingTraits { StringValue()); // Don't print out when it's empty. YamlIO.mapOptional("flags", Reg.RegisterFlags, std::vector()); - if(!YamlIO.outputting() || !Reg.AntiHints.empty()) { // Only map when parsing or anti-hints present + if (!YamlIO.outputting() || + !Reg.AntiHints.empty()) { // Only map when parsing or anti-hints present YamlIO.mapOptional("anti-hints", Reg.AntiHints, - std::vector()); // for anti-hints + std::vector()); // for anti-hints } } diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index bcee5d6b30439..5f00aeebb46fe 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -880,20 +880,21 @@ class MachineRegisterInfo { AntiHints.push_back(AntiHintVReg); } - /// addRegAllocationAntiHint - Add multiple anti-hints at once - void addRegAllocationAntiHints(Register VReg, ArrayRef AntiHintVRegs) { + /// addRegAllocationAntiHint - Add multiple anti-hints at once. + void addRegAllocationAntiHints(Register VReg, + ArrayRef AntiHintVRegs) { for (Register AntiHint : AntiHintVRegs) setRegAllocationAntiHint(VReg, AntiHint); } - /// clearRegAllocationAntiHints - Clear all anti-hints for a register + /// clearRegAllocationAntiHints - Clear all anti-hints for a register. void clearRegAllocationAntiHints(Register VReg) { assert(VReg.isVirtual()); if (AntiHintRegs.inBounds(VReg)) AntiHintRegs[VReg].clear(); } - /// getRegAllocationAntiHints - Return the vector of anti-hints for VReg + /// getRegAllocationAntiHints - Return the vector of anti-hints for VReg. 
ArrayRef getRegAllocationAntiHints(Register VReg) const { assert(VReg.isVirtual()); if (!AntiHintRegs.inBounds(VReg)) @@ -901,7 +902,7 @@ class MachineRegisterInfo { return AntiHintRegs[VReg]; } - /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint + /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint. bool hasRegAllocationAntiHint(Register VReg, Register AntiHintVReg) const { assert(VReg.isVirtual() && AntiHintVReg.isVirtual()); if (!AntiHintRegs.inBounds(VReg)) @@ -910,11 +911,11 @@ class MachineRegisterInfo { return llvm::find(AntiHints, AntiHintVReg) != AntiHints.end(); } - /// getPhysRegAntiHints - Get the set of physical registers to avoid based on - /// anti-hints and current allocations. This is called during allocation. + /// getPhysRegAntiHints - Get the set of physical registers to avoid. /// VRM is the current virtual register map showing allocations made so far. - void getPhysRegAntiHints(Register VReg, SmallVectorImpl &PhysAntiHints, - const VirtRegMap *VRM) const; + void getPhysRegAntiHints(Register VReg, + SmallVectorImpl &PhysAntiHints, + const VirtRegMap *VRM) const; /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the /// specified register as undefined which causes the DBG_VALUE to be diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp index f420c96e212d0..8550759f97e8a 100644 --- a/llvm/lib/CodeGen/AllocationOrder.cpp +++ b/llvm/lib/CodeGen/AllocationOrder.cpp @@ -49,7 +49,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM, // Get anti-hints SmallVector AntiHintedPhysRegs; MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, &VRM); - + LLVM_DEBUG({ if (!AntiHintedPhysRegs.empty()) { dbgs() << "anti-hints:"; @@ -58,14 +58,14 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM, dbgs() << '\n'; } }); - + // Create allocation order object AllocationOrder AO(std::move(Hints), 
Order, HardHints); - + // Apply anti-hints filtering if needed if (!AntiHintedPhysRegs.empty()) { AO.applyAntiHints(AntiHintedPhysRegs, TRI); - + LLVM_DEBUG({ if (!AO.Hints.empty()) { dbgs() << "filtered hints:"; @@ -76,38 +76,37 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM, }); } - assert(all_of(AO.Hints, [&](MCPhysReg Hint) { return is_contained(AO.Order, Hint); }) && "Target hint is outside allocation order."); return AO; } -void AllocationOrder::applyAntiHints(ArrayRef AntiHintedPhysRegs, +void AllocationOrder::applyAntiHints(ArrayRef AntiHintedPhysRegs, const TargetRegisterInfo *TRI) { // Create filtered order FilteredOrderStorage.clear(); FilteredOrderStorage.reserve(Order.size()); - + // Add non-anti-hinted registers first for (MCPhysReg PhysReg : Order) { if (!is_contained(AntiHintedPhysRegs, PhysReg)) { FilteredOrderStorage.push_back(PhysReg); } } - + // Add anti-hinted registers at the end as last resort for (MCPhysReg PhysReg : Order) { if (is_contained(AntiHintedPhysRegs, PhysReg)) { FilteredOrderStorage.push_back(PhysReg); } } - + // Update Order Order = FilteredOrderStorage; - + LLVM_DEBUG({ - dbgs() << "moved " << AntiHintedPhysRegs.size() + dbgs() << "moved " << AntiHintedPhysRegs.size() << " anti-hinted registers to end of allocation order\n"; }); } diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h index 029d9c83baf35..cda5fd08e0af6 100644 --- a/llvm/lib/CodeGen/AllocationOrder.h +++ b/llvm/lib/CodeGen/AllocationOrder.h @@ -119,11 +119,10 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder { static_cast(std::numeric_limits::max())); return Reg.isPhysical() && is_contained(Hints, Reg.id()); } - + /// Apply anti-hints to the allocation order. 
- void applyAntiHints(ArrayRef AntiHintedPhysRegs, + void applyAntiHints(ArrayRef AntiHintedPhysRegs, const TargetRegisterInfo *TRI); - }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index d63f8040de331..f1c89d03a3281 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -739,13 +739,12 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS, for (const auto &AntiHintValue : VReg.AntiHints) { if (Info.Kind != VRegInfo::NORMAL) return error(VReg.Class.SourceRange.Start, - Twine("anti-hints can only be set for normal vregs")); + Twine("anti-hints can only be set for normal vregs")); Register AntiHintReg; - if (parseRegisterReference(PFS, AntiHintReg, - AntiHintValue.Value, Error)) + if (parseRegisterReference(PFS, AntiHintReg, AntiHintValue.Value, Error)) return error(Error, AntiHintValue.SourceRange); - + Info.AntiHints.push_back(AntiHintReg); } diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index bbf03830b3bd5..6ecc2119840d1 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -676,16 +676,16 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const { return false; } -void MachineRegisterInfo::getPhysRegAntiHints(Register VReg, - SmallVectorImpl &PhysAntiHints, - const VirtRegMap *VRM) const { +void MachineRegisterInfo::getPhysRegAntiHints( + Register VReg, SmallVectorImpl &PhysAntiHints, + const VirtRegMap *VRM) const { assert(VReg.isVirtual()); if (!AntiHintRegs.inBounds(VReg) || !VRM) return; - + const auto &AntiHints = AntiHintRegs[VReg]; const TargetRegisterInfo *TRI = getTargetRegisterInfo(); - + for (Register AntiHintVReg : AntiHints) { // Check if the anti-hinted register has been allocated if (VRM->hasPhys(AntiHintVReg)) { @@ -696,7 +696,7 @@ void MachineRegisterInfo::getPhysRegAntiHints(Register VReg, } } 
} - + // Remove duplicates llvm::sort(PhysAntiHints); PhysAntiHints.erase(llvm::unique(PhysAntiHints), PhysAntiHints.end()); From 893d522a057e3a5df78cf44cf194a707264f98b4 Mon Sep 17 00:00:00 2001 From: mssefat Date: Tue, 23 Sep 2025 11:48:50 -0400 Subject: [PATCH 18/20] Modified flag name to reflect anti-hints --- llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 4 ++-- llvm/lib/CodeGen/MachineRegisterInfo.cpp | 6 ++++-- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 14 ++++++-------- .../llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir | 6 ------ .../AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir | 2 +- 5 files changed, 13 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 5f00aeebb46fe..6c3d4c9b2515b 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -874,7 +874,7 @@ class MachineRegisterInfo { assert(VReg.isVirtual() && "Anti-hints are only for virtual registers"); assert(AntiHintVReg.isVirtual() && "Anti-hint target must be virtual"); AntiHintRegs.grow(Register::index2VirtReg(getNumVirtRegs())); - auto &AntiHints = AntiHintRegs[VReg]; + SmallVector &AntiHints = AntiHintRegs[VReg]; // Avoid duplicates if (llvm::find(AntiHints, AntiHintVReg) == AntiHints.end()) AntiHints.push_back(AntiHintVReg); @@ -907,7 +907,7 @@ class MachineRegisterInfo { assert(VReg.isVirtual() && AntiHintVReg.isVirtual()); if (!AntiHintRegs.inBounds(VReg)) return false; - const auto &AntiHints = AntiHintRegs[VReg]; + const SmallVector &AntiHints = AntiHintRegs[VReg]; return llvm::find(AntiHints, AntiHintVReg) != AntiHints.end(); } diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 6ecc2119840d1..0b49fb0a08b94 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -11,12 +11,14 @@ 
//===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -683,7 +685,7 @@ void MachineRegisterInfo::getPhysRegAntiHints( if (!AntiHintRegs.inBounds(VReg) || !VRM) return; - const auto &AntiHints = AntiHintRegs[VReg]; + const SmallVector &AntiHints = AntiHintRegs[VReg]; const TargetRegisterInfo *TRI = getTargetRegisterInfo(); for (Register AntiHintVReg : AntiHints) { @@ -700,4 +702,4 @@ void MachineRegisterInfo::getPhysRegAntiHints( // Remove duplicates llvm::sort(PhysAntiHints); PhysAntiHints.erase(llvm::unique(PhysAntiHints), PhysAntiHints.end()); -} \ No newline at end of file +} diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 098ca1120c85c..0a08cbbdbf2dc 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -34,7 +34,6 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -44,11 +43,11 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-pre-ra-optimizations" -static cl::opt EnableRegisterAvoidListForMFMARegs( - "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden, - cl::desc("Enable Register Avoidance for " - "MFMA in GCNPreRAOptimizations stage."), - cl::init(true)); +static cl::opt + 
EnableAntiHintsForMFMARegs("amdgpu-anti-hints-for-mfma", cl::Hidden, + cl::desc("Enable Anti-Hints for " + "MFMA in GCNPreRAOptimizations stage."), + cl::init(true)); namespace { @@ -256,10 +255,9 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { bool Changed = false; // Single pass implementation - if (EnableRegisterAvoidListForMFMARegs && ST.hasMAIInsts()) { + if (EnableAntiHintsForMFMARegs && ST.hasMAIInsts()) { // Max lookback window for RAW or WAW hazard constexpr unsigned MaxLookbackWindow = 19; - SIMachineFunctionInfo *MFI = MF.getInfo(); for (const MachineBasicBlock &MBB : MF) { SmallVector>, 16> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir index 89ac0978a0f72..58c7d71089b1b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir @@ -1,11 +1,5 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -debug -run-pass=greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s --- | - ; ModuleID = '/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir' - source_filename = "/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir" - target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" - target triple = "amdgcn-amd-amdhsa" - - ; Function Attrs: nounwind define amdgpu_kernel void @test_software_pipelining() #0 { bb.0: ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir index 
d360eccaeb773..ba89b09539113 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck %s --check-prefix=CHECK -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-anti-hints-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT --- | target triple = "amdgcn-amd-amdhsa" From 9fe1c13085ab127aa437d712b9810e7e7721cf52 Mon Sep 17 00:00:00 2001 From: mssefat Date: Thu, 25 Sep 2025 17:58:12 -0400 Subject: [PATCH 19/20] [NFC] Restore and remove to move MIR serialization changes to separate PR (cherry picked from commit 7732ae8ae1080ab030db1939141350abc7aa265d) --- .../include/llvm/CodeGen/MIRParser/MIParser.h | 1 - llvm/include/llvm/CodeGen/MIRYamlMapping.h | 6 - llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 18 -- llvm/lib/CodeGen/MIRPrinter.cpp | 11 -- ...vm.amdgcn.mfma.anti-hints-parse.gfx942.mir | 171 ------------------ ...vm.amdgcn.mfma.anti-hints-print.gfx942.mir | 126 ------------- 6 files changed, 333 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index cf7a56587397d..0f2898d3554d0 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ 
b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -45,7 +45,6 @@ struct VRegInfo { } D; Register VReg; Register PreferredReg; - SmallVector AntiHints; // Anti-hints uint8_t Flags = 0; }; diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 9c0056fc03376..e80c13885805b 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -192,7 +192,6 @@ struct VirtualRegisterDefinition { StringValue Class; StringValue PreferredRegister; std::vector RegisterFlags; - std::vector AntiHints; // TODO: Serialize the target specific register hints. @@ -210,11 +209,6 @@ template <> struct MappingTraits { StringValue()); // Don't print out when it's empty. YamlIO.mapOptional("flags", Reg.RegisterFlags, std::vector()); - if (!YamlIO.outputting() || - !Reg.AntiHints.empty()) { // Only map when parsing or anti-hints present - YamlIO.mapOptional("anti-hints", Reg.AntiHints, - std::vector()); // for anti-hints - } } static const bool flow = true; diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index f1c89d03a3281..0f792b0ef206c 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -735,19 +735,6 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS, FlagStringValue.Value + "'"); Info.Flags |= FlagValue; } - - for (const auto &AntiHintValue : VReg.AntiHints) { - if (Info.Kind != VRegInfo::NORMAL) - return error(VReg.Class.SourceRange.Start, - Twine("anti-hints can only be set for normal vregs")); - - Register AntiHintReg; - if (parseRegisterReference(PFS, AntiHintReg, AntiHintValue.Value, Error)) - return error(Error, AntiHintValue.SourceRange); - - Info.AntiHints.push_back(AntiHintReg); - } - RegInfo.noteNewVirtualRegister(Info.VReg); } @@ -814,11 +801,6 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS, MRI.setRegClass(Reg, Info.D.RC); if 
(Info.PreferredReg != 0) MRI.setSimpleHint(Reg, Info.PreferredReg); - - for (Register AntiHint : Info.AntiHints) { - if (AntiHint != 0) - MRI.setRegAllocationAntiHint(Reg, AntiHint); - } break; case VRegInfo::GENERIC: break; diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 1a88ff279a3c2..bf8a6cdf097a9 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -316,17 +316,6 @@ static void convertMRI(yaml::MachineFunction &YamlMF, const MachineFunction &MF, if (PreferredReg) printRegMIR(PreferredReg, VReg.PreferredRegister, TRI); printRegFlags(Reg, VReg.RegisterFlags, MF, TRI); - // Print the anti-hints. - const auto &AntiHints = RegInfo.getRegAllocationAntiHints(Reg); - if (!AntiHints.empty()) { - std::vector AntiHintStrings; - for (Register AntiHint : AntiHints) { - yaml::FlowStringValue AntiHintStr; - printRegMIR(AntiHint, AntiHintStr, TRI); - AntiHintStrings.push_back(std::move(AntiHintStr)); - } - VReg.AntiHints = std::move(AntiHintStrings); - } YamlMF.VirtualRegisters.push_back(std::move(VReg)); } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir deleted file mode 100644 index 58c7d71089b1b..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir +++ /dev/null @@ -1,171 +0,0 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -debug -run-pass=greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s ---- | - define amdgpu_kernel void @test_software_pipelining() #0 { - bb.0: - ret void - } - - attributes #0 = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-waves-per-eu"="2" "frame-pointer"="none" "target-cpu"="gfx942" } -... 
---- -name: test_software_pipelining -registers: - - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%4', - '%25', - '%5', - '%22', - '%6', - '%19', - '%7', - '%18', - '%16', - '%8' ] } - - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%8', - '%6', - '%19', - '%7', - '%18', - '%16' ] } - - { id: 2, class: sgpr_128, preferred-register: '', flags: [ ] } - - { id: 3, class: vgpr_32, preferred-register: '', flags: [ ] } - - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } - - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } - - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%17', '%1', '%23', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%4', - '%25', - '%5', - '%22', - '%6', - '%19', - '%7', - '%18', - '%16', - '%8' ] } - - { id: 10, class: vgpr_32, preferred-register: '', flags: [ ] } - - { id: 11, class: vgpr_32, preferred-register: '', flags: [ ] } - - { id: 12, class: vgpr_32, preferred-register: '', flags: [ ] } - - { id: 13, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 14, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 15, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%8' ] } - - { id: 18, 
class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] } - - { id: 20, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 21, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } - - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%6', '%19', '%7', '%18', '%16', '%8' ] } - - { id: 24, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] } - - { id: 26, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 27, class: vreg_128_align2, preferred-register: '', flags: [ ] } - - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%4', - '%25', - '%5', - '%22', - '%6', - '%19', - '%7', - '%18', - '%16', - '%8' ] } - - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - '%4', - '%25', - '%5', - '%22', - '%6', - '%19', - '%7', - '%18', - '%16', - '%8' ] } - - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [ ], - anti-hints: [ '%4', '%25', '%5', '%22', '%6', '%19', '%7', '%18', - '%16', '%8' ] } -body: | - bb.0: - ; CHECK-LABEL: name: test_software_pipelining - ; CHECK: renamable $vgpr36 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr37 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF - ; 
CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr38 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr24 = V_ADD_U32_e32 4096, $vgpr38, implicit $exec - ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 $vgpr36, killed $vgpr20, implicit $exec - ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 4096, killed $vgpr20, implicit $exec - ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr20, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr24, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr24_vgpr25, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128_gfx9 renamable $vgpr37, 4096, 0, implicit $exec :: (load (s128), addrspace 3) - ; CHECK-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr26_vgpr27, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr20_vgpr21, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr22_vgpr23, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: 
renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr24_vgpr25, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 killed renamable $vgpr37, 6144, 0, implicit $exec :: (load (s128), addrspace 3) - ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr26_vgpr27, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr20_vgpr21, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, killed $vgpr22_vgpr23, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr32_vgpr33, killed $vgpr24_vgpr25, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; CHECK-NEXT: dead renamable $vgpr20 = V_ADD_U32_e32 killed $vgpr36, killed $vgpr20, implicit $exec - ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr38, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; CHECK-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr28_vgpr29_vgpr30_vgpr31, implicit killed renamable $vgpr8_vgpr9_vgpr10_vgpr11, implicit killed renamable $vgpr12_vgpr13_vgpr14_vgpr15, implicit killed renamable $vgpr4_vgpr5_vgpr6_vgpr7, implicit killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed renamable 
$vgpr20_vgpr21_vgpr22_vgpr23 - %0:vgpr_32 = IMPLICIT_DEF - %1:vgpr_32 = IMPLICIT_DEF - %2:sgpr_128 = IMPLICIT_DEF - %3:vgpr_32 = IMPLICIT_DEF - %4:vreg_128_align2 = IMPLICIT_DEF - %5:vreg_128_align2 = IMPLICIT_DEF - %6:vreg_128_align2 = IMPLICIT_DEF - %7:vreg_128_align2 = IMPLICIT_DEF - %8:vreg_128_align2 = IMPLICIT_DEF - %9:vgpr_32 = IMPLICIT_DEF - %10:vgpr_32 = V_ADD_U32_e32 4096, %9, implicit $exec - %11:vgpr_32 = V_ADD_U32_e32 %0, %3, implicit $exec - %12:vgpr_32 = V_ADD_U32_e32 4096, %11, implicit $exec - %13:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %12, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %14:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %10, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %15:vreg_128_align2 = IMPLICIT_DEF - %16:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %14.sub0_sub1, %8, 0, 0, 0, implicit $mode, implicit $exec - %17:vreg_128_align2 = DS_READ_B128_gfx9 %1, 4096, 0, implicit $exec :: (load (s128), addrspace 3) - dead %18:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %14.sub2_sub3, %16, 0, 0, 0, implicit $mode, implicit $exec - %19:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %13.sub0_sub1, %7, 0, 0, 0, implicit $mode, implicit $exec - %20:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %13.sub2_sub3, %19, 0, 0, 0, implicit $mode, implicit $exec - %21:vreg_128_align2 = IMPLICIT_DEF - %22:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %14.sub0_sub1, %6, 0, 0, 0, implicit $mode, implicit $exec - %23:vreg_128_align2 = DS_READ_B128_gfx9 %1, 6144, 0, implicit $exec :: (load (s128), addrspace 3) - %24:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %14.sub2_sub3, %22, 0, 0, 0, implicit $mode, implicit $exec - %25:vreg_128_align2 = contract 
V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %13.sub0_sub1, %5, 0, 0, 0, implicit $mode, implicit $exec - %26:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %13.sub2_sub3, %25, 0, 0, 0, implicit $mode, implicit $exec - %27:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %17.sub0_sub1, %14.sub0_sub1, %4, 0, 0, 0, implicit $mode, implicit $exec - %28:vgpr_32 = IMPLICIT_DEF - dead %29:vgpr_32 = V_ADD_U32_e32 %0, %28, implicit $exec - %30:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %9, %2, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - S_ENDPGM 0, implicit %23, implicit %24, implicit %20, implicit %26, implicit %27, implicit %30 -... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir deleted file mode 100644 index c6de026d447fd..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir +++ /dev/null @@ -1,126 +0,0 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck -check-prefix=CHECK %s - ---- | - target triple = "amdgcn-amd-amdhsa" - - define amdgpu_kernel void @test_software_pipelining() #0 { - bb.0: - ret void - } - - attributes #0 = {nounwind "amdgpu-waves-per-eu"="2" "amdgpu-agpr-alloc"="0" "frame-pointer"="none"} - -... 
---- -name: test_software_pipelining -body: | - bb.0: - ; CHECK-LABEL: name: test_software_pipelining - ; CHECK: registers: - ; CHECK: - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%4', - ; CHECK:{{\s*}}'%8' ] } - ; CHECK: - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%8', - ; CHECK:{{\s*}}'%16' ] } - ; CHECK: - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } - ; CHECK: - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } - ; CHECK: - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } - ; CHECK: - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } - ; CHECK: - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%17'{{.*}}'%9' ] } - ; CHECK: - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%4', - ; CHECK:{{\s*}}'%8' ] } - ; CHECK: - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } - ; CHECK: - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%8' ] } - ; CHECK: - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } - ; CHECK: - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] } - ; CHECK: - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } - ; CHECK: - { id: 23, class: vreg_128_align2, preferred-register: '', 
flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%6'{{.*}}'%8' ] } - ; CHECK: - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [ ], - ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] } - ; CHECK: - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%4', - ; CHECK:{{\s*}}'%8' ] } - ; CHECK: - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ], anti-hints: [ - ; CHECK-NEXT:{{\s*}}'%4', - ; CHECK:{{\s*}}'%8' ] } - ; CHECK: - { id: 30, class: vreg_128_align2 - ; CHECK-NEXT: {{.*}}anti-hints: [ '%4' - ; CHECK: {{.*}}'%8' ] } - ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, [[DEF9]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF3]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, [[V_ADD_U32_e32_1]], implicit $exec - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_2]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF - 
; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF8]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 4096, 0, implicit $exec :: (load (s128), addrspace 3) - ; CHECK-NEXT: dead [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF7]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF6]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 6144, 0, implicit $exec :: (load (s128), addrspace 3) - ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: 
[[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF5]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF4]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: dead [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF12]], implicit $exec - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF9]], [[DEF2]], 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[DS_READ_B128_gfx9_1]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8]], implicit [[BUFFER_LOAD_DWORDX4_OFFEN2]] - %3:vgpr_32 = IMPLICIT_DEF - %4:vgpr_32 = IMPLICIT_DEF - %6:sgpr_128 = IMPLICIT_DEF - %7:vgpr_32 = IMPLICIT_DEF - %19:vreg_128_align2 = IMPLICIT_DEF - %20:vreg_128_align2 = IMPLICIT_DEF - %21:vreg_128_align2 = IMPLICIT_DEF - %22:vreg_128_align2 = IMPLICIT_DEF - %23:vreg_128_align2 = IMPLICIT_DEF - %25:vgpr_32 = IMPLICIT_DEF - %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec - %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec - %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec - 
%28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - %31:vreg_128_align2 = IMPLICIT_DEF - %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3) - %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %37:vreg_128_align2 = IMPLICIT_DEF - %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3) - %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %41:vreg_128_align2 = 
contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec - %43:vgpr_32 = IMPLICIT_DEF - %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec - %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - S_ENDPGM 0, implicit %38, implicit %39, implicit %35, implicit %41, implicit %42, implicit %44 -... From 6d8e0447d2861a2646cb45adddddf579fa0cbba8 Mon Sep 17 00:00:00 2001 From: mssefat Date: Fri, 26 Sep 2025 12:19:21 -0400 Subject: [PATCH 20/20] Named operand and stable partition applied (cherry picked from commit ee6d876fcc3d84d6ea3a68b3eee1ce97e714b6e6) --- .../llvm/CodeGen/MachineRegisterInfo.h | 10 +- llvm/lib/CodeGen/AllocationOrder.cpp | 39 +- llvm/lib/CodeGen/AllocationOrder.h | 2 + llvm/lib/CodeGen/MachineRegisterInfo.cpp | 15 +- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 53 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 92 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 120 +- .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 64 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 312 ++--- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 1071 ++++++----------- .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 67 +- 11 files changed, 749 insertions(+), 1096 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 6c3d4c9b2515b..0cfb8454dcd99 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -867,13 +867,13 @@ class MachineRegisterInfo { return RegAllocHints.inBounds(VReg) ? 
&RegAllocHints[VReg] : nullptr; } - /// setRegAllocationAntiHint - Add a register allocation anti-hint for the + /// addRegAllocAntiHint - Add a register allocation anti-hint for the /// specified virtual register. This tells the allocator to avoid allocating /// VReg to the same physical register as AntiHintVReg (or overlapping ones). - void setRegAllocationAntiHint(Register VReg, Register AntiHintVReg) { + void addRegAllocAntiHint(Register VReg, Register AntiHintVReg) { assert(VReg.isVirtual() && "Anti-hints are only for virtual registers"); assert(AntiHintVReg.isVirtual() && "Anti-hint target must be virtual"); - AntiHintRegs.grow(Register::index2VirtReg(getNumVirtRegs())); + AntiHintRegs.grow(VReg); SmallVector &AntiHints = AntiHintRegs[VReg]; // Avoid duplicates if (llvm::find(AntiHints, AntiHintVReg) == AntiHints.end()) @@ -884,7 +884,7 @@ class MachineRegisterInfo { void addRegAllocationAntiHints(Register VReg, ArrayRef AntiHintVRegs) { for (Register AntiHint : AntiHintVRegs) - setRegAllocationAntiHint(VReg, AntiHint); + addRegAllocAntiHint(VReg, AntiHint); } /// clearRegAllocationAntiHints - Clear all anti-hints for a register. @@ -915,7 +915,7 @@ class MachineRegisterInfo { /// VRM is the current virtual register map showing allocations made so far. 
void getPhysRegAntiHints(Register VReg, SmallVectorImpl &PhysAntiHints, - const VirtRegMap *VRM) const; + const VirtRegMap &VRM) const; /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the /// specified register as undefined which causes the DBG_VALUE to be diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp index 8550759f97e8a..32005fd6ff837 100644 --- a/llvm/lib/CodeGen/AllocationOrder.cpp +++ b/llvm/lib/CodeGen/AllocationOrder.cpp @@ -48,7 +48,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM, // Get anti-hints SmallVector AntiHintedPhysRegs; - MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, &VRM); + MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, VRM); LLVM_DEBUG({ if (!AntiHintedPhysRegs.empty()) { @@ -84,29 +84,34 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM, void AllocationOrder::applyAntiHints(ArrayRef AntiHintedPhysRegs, const TargetRegisterInfo *TRI) { + // Helper to check if a register overlaps with any anti-hint + auto isAntiHinted = [&](MCPhysReg Reg) { + return std::any_of( + AntiHintedPhysRegs.begin(), AntiHintedPhysRegs.end(), + [&](MCPhysReg AntiHint) { return TRI->regsOverlap(Reg, AntiHint); }); + }; + // Create filtered order FilteredOrderStorage.clear(); - FilteredOrderStorage.reserve(Order.size()); - - // Add non-anti-hinted registers first - for (MCPhysReg PhysReg : Order) { - if (!is_contained(AntiHintedPhysRegs, PhysReg)) { - FilteredOrderStorage.push_back(PhysReg); - } - } + FilteredOrderStorage.assign(Order.begin(), Order.end()); - // Add anti-hinted registers at the end as last resort - for (MCPhysReg PhysReg : Order) { - if (is_contained(AntiHintedPhysRegs, PhysReg)) { - FilteredOrderStorage.push_back(PhysReg); - } - } + // Partition: non-anti-hinted registers go first + auto PartitionPoint = std::stable_partition( + FilteredOrderStorage.begin(), FilteredOrderStorage.end(), + [&](MCPhysReg Reg) { 
return !isAntiHinted(Reg); }); // Update Order Order = FilteredOrderStorage; LLVM_DEBUG({ - dbgs() << "moved " << AntiHintedPhysRegs.size() - << " anti-hinted registers to end of allocation order\n"; + size_t NonAntiHintedCount = + std::distance(FilteredOrderStorage.begin(), PartitionPoint); + size_t AntiHintedCount = + std::distance(PartitionPoint, FilteredOrderStorage.end()); + dbgs() << " Added " << NonAntiHintedCount + << " non-anti-hinted registers first\n" + << " Added " << AntiHintedCount + << " anti-hinted registers at the end\n" + << " Anti-hint filtering complete\n"; }); } diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h index cda5fd08e0af6..0b10272731444 100644 --- a/llvm/lib/CodeGen/AllocationOrder.h +++ b/llvm/lib/CodeGen/AllocationOrder.h @@ -30,6 +30,8 @@ class LiveRegMatrix; class LLVM_LIBRARY_VISIBILITY AllocationOrder { const SmallVector Hints; + // Used as storage if the Order received in the constructor needs to be + // altered. SmallVector FilteredOrderStorage; ArrayRef Order; // How far into the Order we can iterate. 
This is 0 if the AllocationOrder is diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 0b49fb0a08b94..1cd74d3561b2b 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -680,22 +680,19 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const { void MachineRegisterInfo::getPhysRegAntiHints( Register VReg, SmallVectorImpl &PhysAntiHints, - const VirtRegMap *VRM) const { + const VirtRegMap &VRM) const { assert(VReg.isVirtual()); - if (!AntiHintRegs.inBounds(VReg) || !VRM) + if (!AntiHintRegs.inBounds(VReg)) return; const SmallVector &AntiHints = AntiHintRegs[VReg]; - const TargetRegisterInfo *TRI = getTargetRegisterInfo(); for (Register AntiHintVReg : AntiHints) { // Check if the anti-hinted register has been allocated - if (VRM->hasPhys(AntiHintVReg)) { - MCPhysReg PhysReg = VRM->getPhys(AntiHintVReg); - // Add the physical register and all its aliases - for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) { - PhysAntiHints.push_back(*AI); - } + if (VRM.hasPhys(AntiHintVReg)) { + MCPhysReg PhysReg = VRM.getPhys(AntiHintVReg); + // Add the physical register + PhysAntiHints.push_back(PhysReg); } } diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 0a08cbbdbf2dc..dde4a84d45680 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -37,6 +37,7 @@ #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Register.h" #include "llvm/InitializePasses.h" using namespace llvm; @@ -253,37 +254,45 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { TRI = ST.getRegisterInfo(); bool Changed = false; - - // Single pass implementation + // Add RA anti-hints to reduce MFMA hazard NOPs if (EnableAntiHintsForMFMARegs && ST.hasMAIInsts()) { 
// Max lookback window for RAW or WAW hazard constexpr unsigned MaxLookbackWindow = 19; for (const MachineBasicBlock &MBB : MF) { - - SmallVector>, 16> - RecentMFMAs; + SmallVector, 16> RecentMFMAs; for (const MachineInstr &MI : MBB) { if (MI.isDebugInstr()) continue; - const SlotIndex CurrentSlot = LIS->getInstructionIndex(MI).getRegSlot(); + // Handle MFMA instructions if (SIInstrInfo::isMFMA(MI)) { SmallVector MFMARegisters; - auto collectMFMARegister = [&](unsigned OpIdx) { - if (OpIdx >= MI.getNumOperands()) + // Helper to get named operand + auto collectNamedOperand = [&](AMDGPU::OpName OpName, + const char *OpNameStr) { + const MachineOperand *MO = TII->getNamedOperand(MI, OpName); + if (!MO) { + LLVM_DEBUG(dbgs() << " Named operand " << OpNameStr + << " not found\n"); return; - - const MachineOperand &MO = MI.getOperand(OpIdx); - if (MO.isReg() && MO.getReg().isVirtual()) - MFMARegisters.push_back(MO.getReg()); + } + if (MO->isReg() && MO->getReg().isVirtual()) { + Register Reg = MO->getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + // Only consider VGPRs + if (TRI->hasVGPRs(RC)) + MFMARegisters.push_back(Reg); + LLVM_DEBUG(dbgs() << " Collected " << OpNameStr << " : " + << printReg(Reg, TRI) << "\n"); + } }; - // Only collect Matrix C (operand 3) and destination (operand 0) - // registers - collectMFMARegister(0); - collectMFMARegister(3); + // Collect destination and source C registers + collectNamedOperand(AMDGPU::OpName::vdst, "vdst"); // Destination + collectNamedOperand(AMDGPU::OpName::src2, + "src2"); // Matrix C (accumulator) if (!MFMARegisters.empty()) { - RecentMFMAs.emplace_back(CurrentSlot, std::move(MFMARegisters)); + RecentMFMAs.emplace_back(std::move(MFMARegisters)); // Maintain window if (RecentMFMAs.size() > MaxLookbackWindow) RecentMFMAs.erase(RecentMFMAs.begin()); @@ -309,17 +318,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { // Only process VGPR registers if (!TRI->isVGPRClass(CandidateRC)) continue; 
- for (auto It = RecentMFMAs.rbegin(); It != RecentMFMAs.rend(); ++It) { - const SmallVector &MFMARegs = It->second; + const SmallVector &MFMARegs = *It; for (Register MFMAReg : MFMARegs) { - // Verify register class compatibility - const TargetRegisterClass *MFMARC = MRI->getRegClass(MFMAReg); - if (!TRI->hasVGPRs(MFMARC)) - continue; - // Check if MFMA register is dead at current instruction const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg); + const SlotIndex CurrentSlot = + LIS->getInstructionIndex(MI).getRegSlot(); if (!MFMAInterval.liveAt(CurrentSlot)) { // Add bi-directional anti-hints MRI->addRegAllocationAntiHints(CandidateReg, MFMAReg); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 17692a38dfc64..16ea95437881b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -731,10 +731,10 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0 ; GFX90A-VGPR-NEXT: s_nop 3 -; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[2:3], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90A-VGPR-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX90A-VGPR-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f64_4x4x4f64: @@ -747,10 +747,10 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0 ; GFX942-VGPR-NEXT: s_nop 3 -; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0] -; 
GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[2:3], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0) @@ -1629,20 +1629,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] +; 
GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 @@ -1657,20 +1657,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 @@ -1743,20 +1743,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-VGPR-NEXT: 
v_mov_b32_e32 v10, s2 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 -; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] ; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 @@ -1771,20 +1771,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] +; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] ; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index 07a4f33f25b17..2fb677eccc4b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -2460,7 +2460,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -2481,11 +2480,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; 
GFX942-SDAG-NEXT: s_nop 10 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: @@ -2525,7 +2525,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -2546,11 +2545,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: s_nop 11 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], 
s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: @@ -3607,7 +3607,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -3628,11 +3627,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-SDAG-NEXT: s_nop 10 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: @@ -3672,7 +3672,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -3693,11 +3692,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: s_nop 11 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: @@ -3910,7 +3910,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -3931,11 +3930,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-SDAG-NEXT: s_nop 10 -; GFX942-SDAG-NEXT: 
global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: @@ -3975,7 +3975,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -3996,11 +3995,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: s_nop 11 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: 
global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: @@ -4213,7 +4213,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -4234,11 +4233,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-SDAG-NEXT: s_nop 10 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: @@ -4278,7 +4278,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -4299,11 +4298,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: s_nop 11 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: @@ -4516,7 +4516,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG: ; %bb.0: ; %bb ; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -4537,11 +4536,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-SDAG-NEXT: s_nop 10 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; 
GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX942-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: @@ -4581,7 +4581,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG: ; %bb.0: ; %bb ; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 @@ -4602,11 +4601,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-SDAG-NEXT: s_nop 11 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[12:15], s[24:25] offset:48 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[8:11], s[24:25] offset:32 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[4:7], s[24:25] offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v17, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; 
GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 13a96cfa6e650..ceeb00ba55197 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -269,26 +269,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: v_mov_b32_e32 v35, s23 ; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_mov_b32_e32 v16, s16 -; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v32, s16 +; GCN-NEXT: v_mov_b32_e32 v33, s17 +; GCN-NEXT: v_mov_b32_e32 v34, s18 +; GCN-NEXT: v_mov_b32_e32 v35, s19 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NEXT: v_mov_b32_e32 v18, s14 -; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s12 +; GCN-NEXT: v_mov_b32_e32 v33, s13 +; GCN-NEXT: v_mov_b32_e32 v34, s14 +; GCN-NEXT: v_mov_b32_e32 v35, s15 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s8 +; GCN-NEXT: v_mov_b32_e32 v33, s9 +; GCN-NEXT: 
v_mov_b32_e32 v34, s10 +; GCN-NEXT: v_mov_b32_e32 v35, s11 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -332,26 +332,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: v_mov_b32_e32 v35, s23 ; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_mov_b32_e32 v16, s16 -; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v32, s16 +; GCN-NEXT: v_mov_b32_e32 v33, s17 +; GCN-NEXT: v_mov_b32_e32 v34, s18 +; GCN-NEXT: v_mov_b32_e32 v35, s19 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NEXT: v_mov_b32_e32 v18, s14 -; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s12 +; GCN-NEXT: v_mov_b32_e32 v33, s13 +; GCN-NEXT: v_mov_b32_e32 v34, s14 +; GCN-NEXT: v_mov_b32_e32 v35, s15 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v32, s8 +; GCN-NEXT: v_mov_b32_e32 v33, s9 +; GCN-NEXT: v_mov_b32_e32 v34, s10 +; GCN-NEXT: v_mov_b32_e32 v35, s11 +; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v36, 
v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index eefd7b5fea63e..3646d81ed435b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -141,18 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -179,18 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 
v[10:11], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -198,18 +198,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v8, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -260,18 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 
; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -298,18 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -317,18 
+317,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v8, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -1506,26 +1506,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 
-; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -1609,26 +1609,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 
-; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -1666,26 +1666,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 2 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 
sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) @@ -1848,26 +1848,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v32, s16 +; SDAG-NEXT: v_mov_b32_e32 v33, s17 +; SDAG-NEXT: v_mov_b32_e32 v34, s18 +; SDAG-NEXT: v_mov_b32_e32 v35, s19 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 
v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v32, s8 +; SDAG-NEXT: v_mov_b32_e32 v33, s9 +; SDAG-NEXT: v_mov_b32_e32 v34, s10 +; SDAG-NEXT: v_mov_b32_e32 v35, s11 +; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -1951,26 +1951,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v32, s16 +; HEURRC-NEXT: v_mov_b32_e32 v33, s17 +; HEURRC-NEXT: v_mov_b32_e32 v34, s18 +; HEURRC-NEXT: v_mov_b32_e32 v35, s19 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s12 +; HEURRC-NEXT: v_mov_b32_e32 v33, s13 +; HEURRC-NEXT: v_mov_b32_e32 v34, s14 +; HEURRC-NEXT: v_mov_b32_e32 v35, s15 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], 
s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v32, s8 +; HEURRC-NEXT: v_mov_b32_e32 v33, s9 +; HEURRC-NEXT: v_mov_b32_e32 v34, s10 +; HEURRC-NEXT: v_mov_b32_e32 v35, s11 +; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) @@ -2008,26 +2008,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: s_nop 2 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s19 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s15 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; 
VGPRRC-NEXT: s_nop 0 -; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s11 +; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) @@ -5411,18 +5411,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5430,18 +5430,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, 
0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5449,18 +5449,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v8, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -5511,18 +5511,18 @@ define amdgpu_kernel void 
@test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5530,18 +5530,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 
abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5549,18 +5549,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v8, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 9b9d11502d413..ee11b9295a24a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -245,41 +245,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: 
v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, 
v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16: @@ -324,41 +307,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; 
SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0: @@ -403,41 +369,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; 
SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1: @@ -723,41 +672,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 +; GCN-NEXT: 
v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -767,41 +699,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: 
v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 
1, i32 immarg 3) ret <16 x float> %result @@ -811,41 +726,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; GCN-NEXT: s_nop 11 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: 
v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result @@ -1144,41 +1042,24 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 
v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8: @@ -1223,41 +1104,24 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: 
v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0: @@ -1302,41 +1166,24 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: 
v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1: @@ -2202,41 +2049,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: 
test_smfmac_f32_32x32x64_bf8_bf8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; 
SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: @@ -2281,41 +2111,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, 
a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: @@ -2360,41 +2173,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: 
v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: @@ -2604,41 +2400,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 
a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: @@ -2683,41 +2462,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 
a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: 
v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: @@ -2762,41 +2524,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 
v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: @@ -3006,41 +2751,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; 
SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: @@ -3085,41 +2813,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], 
v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: @@ -3164,41 +2875,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: 
v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: @@ -3408,41 +3102,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 
x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; 
SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: @@ -3487,41 +3164,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; 
SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: @@ -3566,41 +3226,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: 
v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index ac88f8d550f9c..4e20c999f5309 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -369,68 +369,69 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mov_b32_e32 v22, 0x7fc00000 +; CHECK-NEXT: 
v_mov_b32_e32 v16, 0x7fc00000 ; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3] ; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; CHECK-NEXT: v_accvgpr_write_b32 a3, s1 +; CHECK-NEXT: v_accvgpr_write_b32 a2, s0 +; CHECK-NEXT: v_mov_b32_e32 v17, v16 +; CHECK-NEXT: v_mov_b32_e32 v18, v16 +; CHECK-NEXT: v_mov_b32_e32 v19, v16 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], a[0:1], a[2:3], v[4:7] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, s0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, s1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7] -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; CHECK-NEXT: v_mov_b32_e32 v5, v4 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11] -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7] +; CHECK-NEXT: v_accvgpr_write_b32 a5, s1 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], a[0:1], a[0:1], v[16:19] +; CHECK-NEXT: v_accvgpr_write_b32 a4, s0 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], a[0:1], a[4:5], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[28:31], a[0:1], a[0:1], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], a[0:1], a[0:1], v[16:19] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], a[0:1], v[4:7] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[4:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7] +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], 
a[0:1], a[2:3], v[4:7] +; CHECK-NEXT: s_nop 3 +; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v28 +; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], a[0:1], a[0:1], v[12:15] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], a[0:1], v[0:3] ; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15] -; CHECK-NEXT: s_nop 2 ; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3] -; CHECK-NEXT: global_store_short v[12:13], v17, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], a[0:1], a[0:1], v[4:7] +; CHECK-NEXT: global_store_short v[12:13], v9, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7] -; CHECK-NEXT: global_store_short v[12:13], v9, off -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27] +; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v10 +; CHECK-NEXT: global_store_short v[12:13], v8, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], a[0:1], a[0:1], v[24:27] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 ; CHECK-NEXT: global_store_short v[12:13], v1, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23] +; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], a[0:1], a[0:1], v[20:23] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 ; CHECK-NEXT: global_store_short v[12:13], v14, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[2:3], a[0:1], v[8:11] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: 
v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11] -; CHECK-NEXT: s_nop 6 +; CHECK-NEXT: s_nop 3 ; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[4:5], a[0:1], v[4:7] ; CHECK-NEXT: global_store_short v[12:13], v8, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0)