diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 1e3562b37d87c..e1647b76702c4 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -902,14 +902,28 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, // really much we can do to fix this. // Some special instructions use M0 as an input. Some even only use // the first lane. Insert a readfirstlane and hope for the best. - if (DstReg == AMDGPU::M0 && - TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) { + const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); + if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) { Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) + + const MCInstrDesc &ReadFirstLaneDesc = + TII->get(AMDGPU::V_READFIRSTLANE_B32); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), ReadFirstLaneDesc, TmpReg) .add(MI.getOperand(1)); + + unsigned SubReg = MI.getOperand(1).getSubReg(); MI.getOperand(1).setReg(TmpReg); + MI.getOperand(1).setSubReg(AMDGPU::NoSubRegister); + + const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1); + const TargetRegisterClass *ConstrainRC = + SubReg == AMDGPU::NoSubRegister + ? OpRC + : TRI->getMatchingSuperRegClass(SrcRC, OpRC, SubReg); + + if (!MRI->constrainRegClass(SrcReg, ConstrainRC)) + llvm_unreachable("failed to constrain register"); } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), MI, MI.getDebugLoc())) { I = std::next(I); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 3bf820a0024e7..f5b52425e7841 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8177,26 +8177,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; } - if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && - NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { - // Instead of creating a copy where src and dst are the same register - // class, we just replace all uses of dst with src. These kinds of - // copies interfere with the heuristics MachineSink uses to decide - // whether or not to split a critical edge. Since the pass assumes - // that copies will end up as machine instructions and not be - // eliminated. - addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) { Register NewDstReg = Inst.getOperand(1).getReg(); - MRI.replaceRegWith(DstReg, NewDstReg); - MRI.clearKillFlags(NewDstReg); - Inst.getOperand(0).setReg(DstReg); - Inst.eraseFromParent(); - // Legalize t16 operand since replaceReg is called after addUsersToVALU - for (MachineOperand &MO : - make_early_inc_range(MRI.use_operands(NewDstReg))) { - legalizeOperandsVALUt16(*MO.getParent(), MRI); + const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg); + if (const TargetRegisterClass *CommonRC = + RI.getCommonSubClass(NewDstRC, SrcRC)) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. + addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, NewDstReg); + MRI.clearKillFlags(NewDstReg); + Inst.getOperand(0).setReg(DstReg); + + if (!MRI.constrainRegClass(NewDstReg, CommonRC)) + llvm_unreachable("failed to constrain register"); + + Inst.eraseFromParent(); + // Legalize t16 operand since replaceReg is called after addUsersToVALU + for (MachineOperand &MO : + make_early_inc_range(MRI.use_operands(NewDstReg))) { + legalizeOperandsVALUt16(*MO.getParent(), MRI); + } + + return; } - return; } // If this is a v2s copy between 16bit and 32bit reg, diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 196958b74442f..ae53bdff7c251 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -10733,15 +10733,16 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -11000,15 +11001,16 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -19023,15 +19025,16 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] @@ -19282,15 +19285,16 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll index aede91b76f441..a13f3513c660e 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll @@ -43,26 +43,25 @@ define void @phi_with_alloca_and_divergent_copy_to_reg(ptr addrspace(5) %diverge ; CHECK-LABEL: phi_with_alloca_and_divergent_copy_to_reg: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_lshr_b32 s6, s32, 6 ; CHECK-NEXT: v_mov_b32_e32 v7, v2 ; CHECK-NEXT: v_mov_b32_e32 v6, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: v_lshrrev_b32_e64 v2, 6, s32 ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_u32_e32 v8, 1, v3 -; CHECK-NEXT: v_lshl_add_u32 v5, v3, 2, v1 -; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v8 -; CHECK-NEXT: v_mov_b32_e32 v2, v1 -; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_lshl_add_u32 v2, v3, 2, v1 +; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: v_add_u32_e32 v2, 1, v3 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v3, v4 +; CHECK-NEXT: v_mov_b32_e32 v2, v0 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %done ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dword v[6:7], v0, off ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll index b05b89fe503f2..116f46df01049 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-readfirstlane-av-register-regression.ll @@ -49,4 +49,19 @@ bb16: ; preds = %bb16, %bb br label %bb16 } - +define void @av_class_to_m0(ptr addrspace(1) %ptr) { +; CHECK-LABEL: av_class_to_m0: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dword v0, v[0:1], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: s_mov_b32 m0, s4 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use m0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %load = load i32, ptr addrspace(1) %ptr + call void asm sideeffect "; use $0", "{m0}"(i32 %load) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index fe432e9d7594d..331a29b3f4a93 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -101,39 +101,39 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_zeroinit: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader @@ -160,39 +160,39 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader @@ -333,6 +333,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 @@ -365,7 +366,6 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader @@ -393,6 +393,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX942-LABEL: test_mfma_loop_unfoldable_splat: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 @@ -425,7 +426,6 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader @@ -559,39 +559,39 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader @@ -618,39 +618,39 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader @@ -821,71 +821,71 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000 ; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader @@ -912,71 +912,71 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX942-LABEL: test_mfma_loop_unfoldable_seq: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000 ; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader @@ -1111,39 +1111,39 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_loop_vgpr_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader @@ -1171,39 +1171,39 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-LABEL: test_mfma_loop_vgpr_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader @@ -1376,46 +1376,47 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; ; GFX90A-LABEL: test_mfma_loop_sgpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a31, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 @@ -1437,46 +1438,47 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; ; GFX942-LABEL: test_mfma_loop_sgpr_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a31, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 @@ -1641,42 +1643,43 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader @@ -1703,42 +1706,43 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a1, s0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader @@ -2090,57 +2094,152 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a0 ; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v32 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v31 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX90A-NEXT: v_accvgpr_mov_b32 a33, a31 +; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a30 +; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a29 +; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a28 +; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a27 +; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a26 +; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a25 +; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a24 +; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a23 +; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a22 +; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a21 +; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a20 +; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a19 +; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a18 +; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a17 +; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a16 +; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a15 +; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a14 +; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a13 +; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a12 +; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a11 +; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a10 +; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a9 +; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a8 +; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a7 +; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a6 +; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a5 +; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a4 +; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a3 +; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a2 +; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1 +; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[2:33], v0, v1, a[2:33] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_mov_b32 a0, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a31 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a33 ; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 -; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v0, a[30:33], s[0:1] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[26:29], s[0:1] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v0, a[22:25], s[0:1] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[18:21], s[0:1] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_agpr_init: @@ -2152,57 +2251,152 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v32, a0 ; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a31, v32 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v31 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 +; GFX942-NEXT: v_accvgpr_mov_b32 a33, a31 +; GFX942-NEXT: v_accvgpr_mov_b32 a32, a30 +; GFX942-NEXT: v_accvgpr_mov_b32 a31, a29 +; GFX942-NEXT: v_accvgpr_mov_b32 a30, a28 +; GFX942-NEXT: v_accvgpr_mov_b32 a29, a27 +; GFX942-NEXT: v_accvgpr_mov_b32 a28, a26 +; GFX942-NEXT: v_accvgpr_mov_b32 a27, a25 +; GFX942-NEXT: v_accvgpr_mov_b32 a26, a24 +; GFX942-NEXT: v_accvgpr_mov_b32 a25, a23 +; GFX942-NEXT: v_accvgpr_mov_b32 a24, a22 +; GFX942-NEXT: v_accvgpr_mov_b32 a23, a21 +; GFX942-NEXT: v_accvgpr_mov_b32 a22, a20 +; GFX942-NEXT: v_accvgpr_mov_b32 a21, a19 +; GFX942-NEXT: v_accvgpr_mov_b32 a20, a18 +; GFX942-NEXT: v_accvgpr_mov_b32 a19, a17 +; GFX942-NEXT: v_accvgpr_mov_b32 a18, a16 +; GFX942-NEXT: v_accvgpr_mov_b32 a17, a15 +; GFX942-NEXT: v_accvgpr_mov_b32 a16, a14 +; GFX942-NEXT: v_accvgpr_mov_b32 a15, a13 +; GFX942-NEXT: v_accvgpr_mov_b32 a14, a12 +; GFX942-NEXT: v_accvgpr_mov_b32 a13, a11 +; GFX942-NEXT: v_accvgpr_mov_b32 a12, a10 +; GFX942-NEXT: v_accvgpr_mov_b32 a11, a9 +; GFX942-NEXT: v_accvgpr_mov_b32 a10, a8 +; GFX942-NEXT: v_accvgpr_mov_b32 a9, a7 +; GFX942-NEXT: v_accvgpr_mov_b32 a8, a6 +; GFX942-NEXT: v_accvgpr_mov_b32 a7, a5 +; GFX942-NEXT: v_accvgpr_mov_b32 a6, a4 +; GFX942-NEXT: v_accvgpr_mov_b32 a5, a3 +; GFX942-NEXT: v_accvgpr_mov_b32 a4, a2 +; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[2:33], v0, v1, a[2:33] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a31 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a32 +; GFX942-NEXT: v_accvgpr_read_b32 v32, a33 ; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GFX942-NEXT: ; %bb.2: ; %exit +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dwordx4 v0, a[30:33], s[0:1] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[26:29], s[0:1] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[22:25], s[0:1] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[18:21], s[0:1] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1] ; GFX942-NEXT: s_endpgm entry: %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) @@ -2609,39 +2803,39 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader @@ -2690,39 +2884,39 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader @@ -2867,39 +3061,39 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX90A-NEXT: s_mov_b32 s4, 16 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader @@ -2948,39 +3142,39 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index 2462414992e36..12efca7dcadb5 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir index ac4f41282ab73..03e3ff95bbad2 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-av-constrain.mir @@ -90,3 +90,22 @@ body: | S_ENDPGM 0 ... +--- +name: constrain_readfirstlane_av64_subreg_m0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: constrain_readfirstlane_av64_subreg_m0 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]].sub0, implicit $exec + ; CHECK-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_]] + %0:sreg_32 = IMPLICIT_DEF + %1:av_64 = COPY $vgpr0_vgpr1 + $m0 = COPY %1.sub0 +... + diff --git a/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll new file mode 100644 index 0000000000000..93d864246d68d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s + +; Make sure SIFixSGPRCopies handles situations where it needs to fix +; up copies to physical registers from an AV virtual register. + +define i32 @fix_sgpr_copies_indirect_call(ptr addrspace(5) %ptr) { +; CHECK-LABEL: fix_sgpr_copies_indirect_call: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: v_writelane_b32 v40, s16, 4 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: s_add_i32 s32, s32, 0x800 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v41, s15, 0 +; CHECK-NEXT: v_writelane_b32 v41, s14, 1 +; CHECK-NEXT: v_writelane_b32 v41, s13, 2 +; CHECK-NEXT: v_writelane_b32 v41, s12, 3 +; CHECK-NEXT: v_writelane_b32 v41, s10, 4 +; CHECK-NEXT: v_writelane_b32 v41, s11, 5 +; CHECK-NEXT: v_writelane_b32 v41, s8, 6 +; CHECK-NEXT: v_writelane_b32 v41, s9, 7 +; CHECK-NEXT: v_writelane_b32 v41, s6, 8 +; CHECK-NEXT: v_writelane_b32 v41, s7, 9 +; CHECK-NEXT: v_writelane_b32 v41, s4, 10 +; CHECK-NEXT: v_writelane_b32 v41, s5, 11 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_writelane_b32 v41, s4, 12 +; CHECK-NEXT: v_writelane_b32 v41, s5, 13 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: v_readfirstlane_b32 s6, v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s8, v1 +; CHECK-NEXT: s_mov_b32 s4, s6 +; CHECK-NEXT: s_mov_b32 s5, s8 +; CHECK-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; CHECK-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; CHECK-NEXT: s_mov_b32 s7, s8 +; CHECK-NEXT: v_writelane_b32 v41, s6, 14 +; CHECK-NEXT: v_writelane_b32 v41, s7, 15 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; CHECK-NEXT: v_writelane_b32 v41, s4, 16 +; CHECK-NEXT: v_writelane_b32 v41, s5, 17 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s16, v41, 14 +; CHECK-NEXT: v_readlane_b32 s17, v41, 15 +; CHECK-NEXT: v_readlane_b32 s15, v41, 0 +; CHECK-NEXT: v_readlane_b32 s14, v41, 1 +; CHECK-NEXT: v_readlane_b32 s13, v41, 2 +; CHECK-NEXT: v_readlane_b32 s12, v41, 3 +; CHECK-NEXT: v_readlane_b32 s10, v41, 4 +; CHECK-NEXT: v_readlane_b32 s11, v41, 5 +; CHECK-NEXT: v_readlane_b32 s8, v41, 6 +; CHECK-NEXT: v_readlane_b32 s9, v41, 7 +; CHECK-NEXT: v_readlane_b32 s6, v41, 8 +; CHECK-NEXT: v_readlane_b32 s7, v41, 9 +; CHECK-NEXT: v_readlane_b32 s4, v41, 10 +; CHECK-NEXT: v_readlane_b32 s5, v41, 11 +; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] +; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] +; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_readlane_b32 s4, v41, 16 +; CHECK-NEXT: v_readlane_b32 s5, v41, 17 +; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_cbranch_execnz .LBB0_2 +; CHECK-NEXT: ; %bb.4: +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s4, v41, 12 +; CHECK-NEXT: v_readlane_b32 s5, v41, 13 +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 4 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %i = load ptr, ptr addrspace(5) %ptr, align 8 + br label %bb1 + +bb1: ; preds = %bb + tail call void %i() + ret i32 0 +}