From 73cf9ed3cae3c77d60062c9ce99262a070175e19 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 21 Nov 2025 14:04:59 +0000 Subject: [PATCH 1/3] Precommit test --- ...x-sgpr-copies-phi-regression-av-classes.ll | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll new file mode 100644 index 0000000000000..618a9a8a05b18 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s + +; Check that the copy from s[2:3] to v[0:1] occurs inside the loop, not after it. + +define i64 @test_temporal_divergence(i32 %arg) #0 { +; CHECK-LABEL: test_temporal_divergence: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_add_u32_e32 v0, 1, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: .LBB0_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_u32_e32 v0, -1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_mov_b64 s[2:3], s[4:5] +; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CHECK-NEXT: s_mov_b64 s[4:5], 1 +; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %end +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + br label %loop + +loop: + %i = phi i64 [ 1, %loop ], [ 0, %entry ] + %count = phi i32 [ %inc, %loop ], [ 0, %entry ] + %inc = add i32 %count, 1 + %cond = icmp eq i32 %count, %arg + br i1 %cond, label %end, label %loop + +end: + ret i64 %i +} From 14aa94c3fcfd535120622e1c12f82bf96e070812 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 21 Nov 2025 13:40:23 +0000 Subject: [PATCH 2/3] [AMDGPU] Handle AV classes in SIFixSGPRCopies::processPHINode Fix a problem exposed by #166483 using AV classes in more places. `isVectorRegister` only accepts registers of VGPR or AGPR classes. `hasVectorRegisters` additionally accepts the combined AV classes. Fixes: #168761 --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 2 +- .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 642 ++-- .../AMDGPU/av-split-dead-valno-crash.ll | 77 +- .../branch-folding-implicit-def-subreg.ll | 48 +- ...x-sgpr-copies-phi-regression-av-classes.ll | 13 +- .../CodeGen/AMDGPU/masked-load-vectortypes.ll | 3 +- llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 2878 ++++++++++++----- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 2 +- .../AMDGPU/promote-constOffset-to-imm.ll | 2 +- .../AMDGPU/tuple-allocation-failure.ll | 72 +- 10 files changed, 2464 insertions(+), 1275 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index e1647b76702c4..3e4b25dd2f663 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -856,7 +856,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { } } - if (TRI->isVectorRegister(*MRI, PHIRes) || + if (TRI->hasVectorRegisters(MRI->getRegClass(PHIRes)) || RC0 == &AMDGPU::VReg_1RegClass) { LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); TII->legalizeOperands(MI, MDT); diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 5cceb918b755e..b8962fa29e8f1 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -926,12 +926,12 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -939,23 +939,23 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB14_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB14_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB14_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -1016,12 +1016,12 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1029,23 +1029,23 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB15_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB15_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB15_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -1294,12 +1294,12 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1307,23 +1307,23 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB18_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB18_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -6406,35 +6406,35 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_add_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB90_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB90_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB90_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB90_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6591,35 +6591,35 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_sub_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB92_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB92_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB92_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB92_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8881,28 +8881,28 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB114_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB114_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] @@ -8911,20 +8911,20 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execnz .LBB114_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB114_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB114_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc ; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen @@ -9027,29 +9027,28 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: ; implicit-def: $vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB115_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc -; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2 ; GFX90A-NEXT: .LBB115_3: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB115_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v2, v1, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: .LBB115_5: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 ; GFX90A-NEXT: .LBB115_6: ; %Flow2 @@ -9066,7 +9065,6 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a: @@ -9829,33 +9827,31 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB127_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB127_3: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB127_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB127_5: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB127_6: ; %Flow2 @@ -9873,7 +9869,6 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a: @@ -9895,32 +9890,30 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1 -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB127_3 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB127_3: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB127_5 ; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-NEXT: .LBB127_5: ; %Flow1 ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v5 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB127_6: ; %Flow2 @@ -9939,7 +9932,6 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -10407,31 +10399,31 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB132_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB132_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB132_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] @@ -10595,31 +10587,31 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB134_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB134_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB134_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] @@ -14438,30 +14430,30 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB194_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB194_3 ; GFX90A-NEXT: s_branch .LBB194_4 ; GFX90A-NEXT: .LBB194_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB194_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB194_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14475,27 +14467,27 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB194_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_cbranch_execz .LBB194_3 ; GFX950-NEXT: s_branch .LBB194_4 ; GFX950-NEXT: .LBB194_2: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB194_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB194_4: ; %atomicrmw.end ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 @@ -14612,32 +14604,32 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB196_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB196_3 ; GFX90A-NEXT: s_branch .LBB196_4 ; GFX90A-NEXT: .LBB196_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB196_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB196_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14651,28 +14643,28 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB196_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_cbranch_execz .LBB196_3 ; GFX950-NEXT: s_branch .LBB196_4 ; GFX950-NEXT: .LBB196_2: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB196_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB196_4: ; %atomicrmw.end ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 @@ -14791,32 +14783,32 @@ define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB198_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB198_3 ; GFX90A-NEXT: s_branch .LBB198_4 ; GFX90A-NEXT: .LBB198_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB198_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB198_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14971,32 +14963,32 @@ define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB200_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB200_3 ; GFX90A-NEXT: s_branch .LBB200_4 ; GFX90A-NEXT: .LBB200_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB200_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB200_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15400,32 +15392,32 @@ define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB204_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB204_3 ; GFX90A-NEXT: s_branch .LBB204_4 ; GFX90A-NEXT: .LBB204_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB204_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX90A-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB204_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15579,32 +15571,32 @@ define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB206_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB206_3 ; GFX90A-NEXT: s_branch .LBB206_4 ; GFX90A-NEXT: .LBB206_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB206_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1 -; GFX90A-NEXT: v_xor_b32_e32 v0, v2, v0 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB206_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15761,33 +15753,33 @@ define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB208_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB208_3 ; GFX90A-NEXT: s_branch .LBB208_4 ; GFX90A-NEXT: .LBB208_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB208_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB208_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15946,33 +15938,33 @@ define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB210_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB210_3 ; GFX90A-NEXT: s_branch .LBB210_4 ; GFX90A-NEXT: .LBB210_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB210_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB210_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -16131,33 +16123,33 @@ define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB212_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB212_3 ; GFX90A-NEXT: s_branch .LBB212_4 ; GFX90A-NEXT: .LBB212_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB212_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB212_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -16316,33 +16308,33 @@ define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB214_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB214_3 ; GFX90A-NEXT: s_branch .LBB214_4 ; GFX90A-NEXT: .LBB214_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB214_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB214_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -16697,37 +16689,37 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB218_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB218_3 ; GFX90A-NEXT: s_branch .LBB218_4 ; GFX90A-NEXT: .LBB218_2: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB218_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB218_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17318,49 +17310,51 @@ define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_cbranch_vccz .LBB223_3 +; GFX90A-NEXT: s_cbranch_vccz .LBB223_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_vccz .LBB223_4 +; GFX90A-NEXT: s_cbranch_vccz .LBB223_7 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc +; GFX90A-NEXT: s_cbranch_execz .LBB223_8 +; GFX90A-NEXT: ; %bb.3: ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_cbranch_execz .LBB223_5 ; GFX90A-NEXT: s_branch .LBB223_6 -; GFX90A-NEXT: .LBB223_3: -; GFX90A-NEXT: ; implicit-def: $agpr0 -; GFX90A-NEXT: s_branch .LBB223_7 ; GFX90A-NEXT: .LBB223_4: ; GFX90A-NEXT: ; implicit-def: $agpr0 -; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.private -; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX90A-NEXT: .LBB223_6: ; %Flow1 -; GFX90A-NEXT: s_cbranch_execnz .LBB223_8 -; GFX90A-NEXT: .LBB223_7: ; %atomicrmw.shared +; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.shared ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v0, v1, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.end +; GFX90A-NEXT: .LBB223_6: ; %atomicrmw.end ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB223_7: +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_cbranch_execz .LBB223_5 +; GFX90A-NEXT: s_branch .LBB223_6 ; ; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a: ; GFX950: ; %bb.0: @@ -18168,16 +18162,13 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB235_5 ; GFX90A-NEXT: s_branch .LBB235_6 ; GFX90A-NEXT: .LBB235_3: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_branch .LBB235_7 ; GFX90A-NEXT: .LBB235_4: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB235_5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 @@ -18185,12 +18176,13 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB235_6: ; %Flow1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execnz .LBB235_8 ; GFX90A-NEXT: .LBB235_7: ; %atomicrmw.shared ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -18204,7 +18196,6 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a: @@ -18231,26 +18222,24 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB235_5 ; GFX950-NEXT: s_branch .LBB235_6 ; GFX950-NEXT: .LBB235_3: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_branch .LBB235_7 ; GFX950-NEXT: .LBB235_4: -; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB235_5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s2, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2 ; GFX950-NEXT: .LBB235_6: ; %Flow1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execnz .LBB235_8 ; GFX950-NEXT: .LBB235_7: ; %atomicrmw.shared ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -18264,7 +18253,6 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -18760,30 +18748,30 @@ define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB240_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_cbranch_execz .LBB240_3 ; GFX950-NEXT: s_branch .LBB240_4 ; GFX950-NEXT: .LBB240_2: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB240_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 -; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB240_4: ; %atomicrmw.end ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 @@ -18942,30 +18930,30 @@ define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB242_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_cbranch_execz .LBB242_3 ; GFX950-NEXT: s_branch .LBB242_4 ; GFX950-NEXT: .LBB242_2: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB242_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 -; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB242_4: ; %atomicrmw.end ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 42f76c4a10d2a..4bc6220b4d9a0 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -48,16 +48,17 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: .LBB0_1: ; %Flow9 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[24:25] -; CHECK-NEXT: s_cbranch_vccz .LBB0_17 +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25] +; CHECK-NEXT: s_cbranch_vccz .LBB0_18 ; CHECK-NEXT: .LBB0_2: ; %._crit_edge1942.i.i.i3548 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB0_6 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_7 Depth 2 ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_9 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_11 ; CHECK-NEXT: ; %bb.3: ; %.preheader1868.i.i.i3244 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 vcc, s[4:5] -; CHECK-NEXT: s_cbranch_vccz .LBB0_10 +; CHECK-NEXT: s_cbranch_vccz .LBB0_12 ; CHECK-NEXT: ; %bb.4: ; %.preheader1855.i.i.i3329.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] @@ -85,49 +86,54 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[18:19] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] -; CHECK-NEXT: s_branch .LBB0_6 -; CHECK-NEXT: .LBB0_5: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 +; CHECK-NEXT: s_branch .LBB0_7 +; CHECK-NEXT: .LBB0_5: ; in Loop: Header=BB0_7 Depth=2 +; CHECK-NEXT: s_mov_b64 s[24:25], -1 +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 +; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: .LBB0_6: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=2 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_11 -; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_13 +; CHECK-NEXT: .LBB0_7: ; %.preheader1855.i.i.i3329 ; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_accvgpr_read_b32 v27, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v26, a0 -; CHECK-NEXT: s_mov_b64 s[24:25], -1 -; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_mov_b64 vcc, s[2:3] -; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 -; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291 -; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 -; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] +; CHECK-NEXT: ; %bb.8: ; %.lr.ph2070.i.i.i3291 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=2 ; CHECK-NEXT: s_mov_b64 vcc, s[6:7] -; CHECK-NEXT: s_cbranch_vccz .LBB0_5 -; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 -; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 +; CHECK-NEXT: s_cbranch_vccz .LBB0_10 +; CHECK-NEXT: ; %bb.9: ; %.preheader1856.preheader.i.i.i3325 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=2 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v29 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 -; CHECK-NEXT: s_branch .LBB0_5 -; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_branch .LBB0_6 +; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_7 Depth=2 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 +; CHECK-NEXT: s_mov_b64 s[24:25], -1 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 +; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] +; CHECK-NEXT: s_branch .LBB0_6 +; CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[10:11] ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21] -; CHECK-NEXT: s_branch .LBB0_15 -; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_branch .LBB0_16 +; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 -; CHECK-NEXT: s_branch .LBB0_15 -; CHECK-NEXT: .LBB0_11: ; %loop.exit.guard +; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[30:31] +; CHECK-NEXT: s_branch .LBB0_16 +; CHECK-NEXT: .LBB0_13: ; %loop.exit.guard ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[24:25] -; CHECK-NEXT: s_cbranch_vccz .LBB0_13 -; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit +; CHECK-NEXT: s_cbranch_vccz .LBB0_15 +; CHECK-NEXT: ; %bb.14: ; %._crit_edge2105.i.i.i2330.loopexit ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[26:27] ; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17] @@ -139,24 +145,21 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cselect_b32 s23, s23, 0 ; CHECK-NEXT: s_cselect_b32 s22, s22, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 -; CHECK-NEXT: s_branch .LBB0_14 -; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_branch .LBB0_16 +; CHECK-NEXT: .LBB0_15: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 -; CHECK-NEXT: .LBB0_14: ; %Flow6 -; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25] -; CHECK-NEXT: .LBB0_15: ; %Flow6 +; CHECK-NEXT: .LBB0_16: ; %Flow6 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_cbranch_vccz .LBB0_1 -; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330 +; CHECK-NEXT: ; %bb.17: ; %._crit_edge2105.i.i.i2330 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 ; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13] ; CHECK-NEXT: s_branch .LBB0_1 -; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock +; CHECK-NEXT: .LBB0_18: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm entry: br label %._crit_edge1942.i.i.i3548 diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 36924175956cb..905c8e36dd692 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -467,7 +467,6 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -489,12 +488,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc @@ -509,7 +508,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, $vcc, 0, implicit $exec @@ -539,17 +538,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc @@ -561,8 +560,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr20, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr66_sgpr67, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr18_sgpr19, implicit-def dead $scc @@ -606,7 +605,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -615,6 +614,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51) ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec @@ -646,7 +646,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc @@ -655,7 +655,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc @@ -669,7 +669,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec @@ -698,7 +698,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc @@ -712,7 +712,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF @@ -727,7 +727,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76) ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec @@ -759,9 +759,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr30 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec + ; GFX90A-NEXT: renamable $vgpr30 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr66_sgpr67, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr7 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr7, 0, 0, implicit $exec :: (load (s64) from %ir.4, addrspace 3) @@ -773,7 +773,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr7 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr22, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} @@ -833,14 +833,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr12 = V_OR_B32_e32 1, $vgpr10, implicit $exec ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $vgpr11, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr12_vgpr13, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF @@ -855,20 +855,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr52_sgpr53, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll index 618a9a8a05b18..b07294c71f608 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-av-classes.ll @@ -7,21 +7,20 @@ define i64 @test_temporal_divergence(i32 %arg) #0 { ; CHECK-LABEL: test_temporal_divergence: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_u32_e32 v0, 1, v0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: v_add_u32_e32 v2, 1, v0 +; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_u32_e32 v0, -1, v0 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_mov_b64 s[2:3], s[4:5] +; CHECK-NEXT: v_add_u32_e32 v2, -1, v2 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_mov_b64 s[4:5], 1 +; CHECK-NEXT: s_mov_b64 s[2:3], 1 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %end ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] -; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll index 053cf0e1c6906..789eb8e480214 100644 --- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll +++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll @@ -217,6 +217,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 @@ -236,7 +237,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB8_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[16:19], v16, s[0:1] +; GFX942-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v19 ; GFX942-NEXT: v_lshrrev_b32_e32 v14, 16, v19 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 85bf05f39c684..8b6bb9b8c5fcd 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -101,54 +101,117 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_zeroinit: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -161,54 +224,117 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -334,54 +460,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x42f60000 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0x42f60000 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v31 +; GFX90A-NEXT: v_mov_b32_e32 v1, v31 +; GFX90A-NEXT: v_mov_b32_e32 v2, v31 +; GFX90A-NEXT: v_mov_b32_e32 v3, v31 +; GFX90A-NEXT: v_mov_b32_e32 v4, v31 +; GFX90A-NEXT: v_mov_b32_e32 v5, v31 +; GFX90A-NEXT: v_mov_b32_e32 v6, v31 +; GFX90A-NEXT: v_mov_b32_e32 v7, v31 +; GFX90A-NEXT: v_mov_b32_e32 v8, v31 +; GFX90A-NEXT: v_mov_b32_e32 v9, v31 +; GFX90A-NEXT: v_mov_b32_e32 v10, v31 +; GFX90A-NEXT: v_mov_b32_e32 v11, v31 +; GFX90A-NEXT: v_mov_b32_e32 v12, v31 +; GFX90A-NEXT: v_mov_b32_e32 v13, v31 +; GFX90A-NEXT: v_mov_b32_e32 v14, v31 +; GFX90A-NEXT: v_mov_b32_e32 v15, v31 +; GFX90A-NEXT: v_mov_b32_e32 v16, v31 +; GFX90A-NEXT: v_mov_b32_e32 v17, v31 +; GFX90A-NEXT: v_mov_b32_e32 v18, v31 +; GFX90A-NEXT: v_mov_b32_e32 v19, v31 +; GFX90A-NEXT: v_mov_b32_e32 v20, v31 +; GFX90A-NEXT: v_mov_b32_e32 v21, v31 +; GFX90A-NEXT: v_mov_b32_e32 v22, v31 +; GFX90A-NEXT: v_mov_b32_e32 v23, v31 +; GFX90A-NEXT: v_mov_b32_e32 v24, v31 +; GFX90A-NEXT: v_mov_b32_e32 v25, v31 +; GFX90A-NEXT: v_mov_b32_e32 v26, v31 +; GFX90A-NEXT: v_mov_b32_e32 v27, v31 +; GFX90A-NEXT: v_mov_b32_e32 v28, v31 +; GFX90A-NEXT: v_mov_b32_e32 v29, v31 +; GFX90A-NEXT: v_mov_b32_e32 v30, v31 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -394,54 +583,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; ; GFX942-LABEL: test_mfma_loop_unfoldable_splat: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v2, 0x42f60000 +; GFX942-NEXT: v_mov_b32_e32 v31, 0x42f60000 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v31 +; GFX942-NEXT: v_mov_b32_e32 v1, v31 +; GFX942-NEXT: v_mov_b32_e32 v2, v31 +; GFX942-NEXT: v_mov_b32_e32 v3, v31 +; GFX942-NEXT: v_mov_b32_e32 v4, v31 +; GFX942-NEXT: v_mov_b32_e32 v5, v31 +; GFX942-NEXT: v_mov_b32_e32 v6, v31 +; GFX942-NEXT: v_mov_b32_e32 v7, v31 +; GFX942-NEXT: v_mov_b32_e32 v8, v31 +; GFX942-NEXT: v_mov_b32_e32 v9, v31 +; GFX942-NEXT: v_mov_b32_e32 v10, v31 +; GFX942-NEXT: v_mov_b32_e32 v11, v31 +; GFX942-NEXT: v_mov_b32_e32 v12, v31 +; GFX942-NEXT: v_mov_b32_e32 v13, v31 +; GFX942-NEXT: v_mov_b32_e32 v14, v31 +; GFX942-NEXT: v_mov_b32_e32 v15, v31 +; GFX942-NEXT: v_mov_b32_e32 v16, v31 +; GFX942-NEXT: v_mov_b32_e32 v17, v31 +; GFX942-NEXT: v_mov_b32_e32 v18, v31 +; GFX942-NEXT: v_mov_b32_e32 v19, v31 +; GFX942-NEXT: v_mov_b32_e32 v20, v31 +; GFX942-NEXT: v_mov_b32_e32 v21, v31 +; GFX942-NEXT: v_mov_b32_e32 v22, v31 +; GFX942-NEXT: v_mov_b32_e32 v23, v31 +; GFX942-NEXT: v_mov_b32_e32 v24, v31 +; GFX942-NEXT: v_mov_b32_e32 v25, v31 +; GFX942-NEXT: v_mov_b32_e32 v26, v31 +; GFX942-NEXT: v_mov_b32_e32 v27, v31 +; GFX942-NEXT: v_mov_b32_e32 v28, v31 +; GFX942-NEXT: v_mov_b32_e32 v29, v31 +; GFX942-NEXT: v_mov_b32_e32 v30, v31 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -561,54 +813,117 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -621,54 +936,117 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -825,85 +1203,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x43190000 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x43180000 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x43170000 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0x43160000 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0x43150000 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x43140000 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0x43130000 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0x43120000 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0x43110000 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0x43100000 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0x430f0000 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0x430e0000 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0x430d0000 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0x430c0000 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0x430b0000 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0x430a0000 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0x43090000 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0x43080000 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0x43070000 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0x43060000 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0x43050000 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0x43040000 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0x43030000 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0x43020000 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0x43010000 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0x43000000 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0x42fe0000 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0x42fc0000 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0x42fa0000 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0x42f80000 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0x42f60000 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v31 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v16 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f80000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x42fa0000 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x42fc0000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x42fe0000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x43000000 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x43010000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x43020000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x43030000 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0x43040000 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0x43050000 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0x43060000 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0x43070000 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0x43080000 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0x43090000 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0x430a0000 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0x430b0000 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0x430c0000 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0x430d0000 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0x430e0000 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0x430f0000 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0x43100000 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0x43110000 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0x43120000 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0x43130000 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0x43140000 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0x43150000 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0x43160000 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0x43170000 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0x43180000 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0x43190000 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0x431a0000 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -916,85 +1326,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX942-LABEL: test_mfma_loop_unfoldable_seq: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000 -; GFX942-NEXT: v_mov_b32_e32 v1, 0x43190000 -; GFX942-NEXT: v_mov_b32_e32 v2, 0x43180000 -; GFX942-NEXT: v_mov_b32_e32 v3, 0x43170000 -; GFX942-NEXT: v_mov_b32_e32 v4, 0x43160000 -; GFX942-NEXT: v_mov_b32_e32 v5, 0x43150000 -; GFX942-NEXT: v_mov_b32_e32 v6, 0x43140000 -; GFX942-NEXT: v_mov_b32_e32 v7, 0x43130000 -; GFX942-NEXT: v_mov_b32_e32 v8, 0x43120000 -; GFX942-NEXT: v_mov_b32_e32 v9, 0x43110000 -; GFX942-NEXT: v_mov_b32_e32 v10, 0x43100000 -; GFX942-NEXT: v_mov_b32_e32 v11, 0x430f0000 -; GFX942-NEXT: v_mov_b32_e32 v12, 0x430e0000 -; GFX942-NEXT: v_mov_b32_e32 v13, 0x430d0000 -; GFX942-NEXT: v_mov_b32_e32 v14, 0x430c0000 -; GFX942-NEXT: v_mov_b32_e32 v15, 0x430b0000 -; GFX942-NEXT: v_mov_b32_e32 v16, 0x430a0000 -; GFX942-NEXT: v_mov_b32_e32 v17, 0x43090000 -; GFX942-NEXT: v_mov_b32_e32 v18, 0x43080000 -; GFX942-NEXT: v_mov_b32_e32 v19, 0x43070000 -; GFX942-NEXT: v_mov_b32_e32 v20, 0x43060000 -; GFX942-NEXT: v_mov_b32_e32 v21, 0x43050000 -; GFX942-NEXT: v_mov_b32_e32 v22, 0x43040000 -; GFX942-NEXT: v_mov_b32_e32 v23, 0x43030000 -; GFX942-NEXT: v_mov_b32_e32 v24, 0x43020000 -; GFX942-NEXT: v_mov_b32_e32 v25, 0x43010000 -; GFX942-NEXT: v_mov_b32_e32 v26, 0x43000000 -; GFX942-NEXT: v_mov_b32_e32 v27, 0x42fe0000 -; GFX942-NEXT: v_mov_b32_e32 v28, 0x42fc0000 -; GFX942-NEXT: v_mov_b32_e32 v29, 0x42fa0000 -; GFX942-NEXT: v_mov_b32_e32 v30, 0x42f80000 -; GFX942-NEXT: v_mov_b32_e32 v31, 0x42f60000 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v31 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v30 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v29 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v28 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v27 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v26 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v25 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v24 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v23 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v22 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v21 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v20 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v19 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v18 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v17 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v16 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v15 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v14 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v13 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v12 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v11 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v10 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v9 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v8 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v7 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v6 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v5 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v4 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f80000 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x42fa0000 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x42fc0000 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x42fe0000 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x43000000 +; GFX942-NEXT: v_mov_b32_e32 v6, 0x43010000 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x43020000 +; GFX942-NEXT: v_mov_b32_e32 v8, 0x43030000 +; GFX942-NEXT: v_mov_b32_e32 v9, 0x43040000 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x43050000 +; GFX942-NEXT: v_mov_b32_e32 v11, 0x43060000 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x43070000 +; GFX942-NEXT: v_mov_b32_e32 v13, 0x43080000 +; GFX942-NEXT: v_mov_b32_e32 v14, 0x43090000 +; GFX942-NEXT: v_mov_b32_e32 v15, 0x430a0000 +; GFX942-NEXT: v_mov_b32_e32 v16, 0x430b0000 +; GFX942-NEXT: v_mov_b32_e32 v17, 0x430c0000 +; GFX942-NEXT: v_mov_b32_e32 v18, 0x430d0000 +; GFX942-NEXT: v_mov_b32_e32 v19, 0x430e0000 +; GFX942-NEXT: v_mov_b32_e32 v20, 0x430f0000 +; GFX942-NEXT: v_mov_b32_e32 v21, 0x43100000 +; GFX942-NEXT: v_mov_b32_e32 v22, 0x43110000 +; GFX942-NEXT: v_mov_b32_e32 v23, 0x43120000 +; GFX942-NEXT: v_mov_b32_e32 v24, 0x43130000 +; GFX942-NEXT: v_mov_b32_e32 v25, 0x43140000 +; GFX942-NEXT: v_mov_b32_e32 v26, 0x43150000 +; GFX942-NEXT: v_mov_b32_e32 v27, 0x43160000 +; GFX942-NEXT: v_mov_b32_e32 v28, 0x43170000 +; GFX942-NEXT: v_mov_b32_e32 v29, 0x43180000 +; GFX942-NEXT: v_mov_b32_e32 v30, 0x43190000 +; GFX942-NEXT: v_mov_b32_e32 v31, 0x431a0000 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1114,54 +1556,117 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_vgpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX90A-NEXT: v_and_b32_e32 v31, 0x3ff, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v31 +; GFX90A-NEXT: v_mov_b32_e32 v1, v31 +; GFX90A-NEXT: v_mov_b32_e32 v2, v31 +; GFX90A-NEXT: v_mov_b32_e32 v3, v31 +; GFX90A-NEXT: v_mov_b32_e32 v4, v31 +; GFX90A-NEXT: v_mov_b32_e32 v5, v31 +; GFX90A-NEXT: v_mov_b32_e32 v6, v31 +; GFX90A-NEXT: v_mov_b32_e32 v7, v31 +; GFX90A-NEXT: v_mov_b32_e32 v8, v31 +; GFX90A-NEXT: v_mov_b32_e32 v9, v31 +; GFX90A-NEXT: v_mov_b32_e32 v10, v31 +; GFX90A-NEXT: v_mov_b32_e32 v11, v31 +; GFX90A-NEXT: v_mov_b32_e32 v12, v31 +; GFX90A-NEXT: v_mov_b32_e32 v13, v31 +; GFX90A-NEXT: v_mov_b32_e32 v14, v31 +; GFX90A-NEXT: v_mov_b32_e32 v15, v31 +; GFX90A-NEXT: v_mov_b32_e32 v16, v31 +; GFX90A-NEXT: v_mov_b32_e32 v17, v31 +; GFX90A-NEXT: v_mov_b32_e32 v18, v31 +; GFX90A-NEXT: v_mov_b32_e32 v19, v31 +; GFX90A-NEXT: v_mov_b32_e32 v20, v31 +; GFX90A-NEXT: v_mov_b32_e32 v21, v31 +; GFX90A-NEXT: v_mov_b32_e32 v22, v31 +; GFX90A-NEXT: v_mov_b32_e32 v23, v31 +; GFX90A-NEXT: v_mov_b32_e32 v24, v31 +; GFX90A-NEXT: v_mov_b32_e32 v25, v31 +; GFX90A-NEXT: v_mov_b32_e32 v26, v31 +; GFX90A-NEXT: v_mov_b32_e32 v27, v31 +; GFX90A-NEXT: v_mov_b32_e32 v28, v31 +; GFX90A-NEXT: v_mov_b32_e32 v29, v31 +; GFX90A-NEXT: v_mov_b32_e32 v30, v31 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1174,54 +1679,117 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_vgpr_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_and_b32_e32 v31, 0x3ff, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v31 +; GFX942-NEXT: v_mov_b32_e32 v1, v31 +; GFX942-NEXT: v_mov_b32_e32 v2, v31 +; GFX942-NEXT: v_mov_b32_e32 v3, v31 +; GFX942-NEXT: v_mov_b32_e32 v4, v31 +; GFX942-NEXT: v_mov_b32_e32 v5, v31 +; GFX942-NEXT: v_mov_b32_e32 v6, v31 +; GFX942-NEXT: v_mov_b32_e32 v7, v31 +; GFX942-NEXT: v_mov_b32_e32 v8, v31 +; GFX942-NEXT: v_mov_b32_e32 v9, v31 +; GFX942-NEXT: v_mov_b32_e32 v10, v31 +; GFX942-NEXT: v_mov_b32_e32 v11, v31 +; GFX942-NEXT: v_mov_b32_e32 v12, v31 +; GFX942-NEXT: v_mov_b32_e32 v13, v31 +; GFX942-NEXT: v_mov_b32_e32 v14, v31 +; GFX942-NEXT: v_mov_b32_e32 v15, v31 +; GFX942-NEXT: v_mov_b32_e32 v16, v31 +; GFX942-NEXT: v_mov_b32_e32 v17, v31 +; GFX942-NEXT: v_mov_b32_e32 v18, v31 +; GFX942-NEXT: v_mov_b32_e32 v19, v31 +; GFX942-NEXT: v_mov_b32_e32 v20, v31 +; GFX942-NEXT: v_mov_b32_e32 v21, v31 +; GFX942-NEXT: v_mov_b32_e32 v22, v31 +; GFX942-NEXT: v_mov_b32_e32 v23, v31 +; GFX942-NEXT: v_mov_b32_e32 v24, v31 +; GFX942-NEXT: v_mov_b32_e32 v25, v31 +; GFX942-NEXT: v_mov_b32_e32 v26, v31 +; GFX942-NEXT: v_mov_b32_e32 v27, v31 +; GFX942-NEXT: v_mov_b32_e32 v28, v31 +; GFX942-NEXT: v_mov_b32_e32 v29, v31 +; GFX942-NEXT: v_mov_b32_e32 v30, v31 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1382,53 +1950,117 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, s1 +; GFX90A-NEXT: v_mov_b32_e32 v31, s1 +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s1 +; GFX90A-NEXT: v_mov_b32_e32 v5, s1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s1 +; GFX90A-NEXT: v_mov_b32_e32 v7, s1 +; GFX90A-NEXT: v_mov_b32_e32 v8, s1 +; GFX90A-NEXT: v_mov_b32_e32 v9, s1 +; GFX90A-NEXT: v_mov_b32_e32 v10, s1 +; GFX90A-NEXT: v_mov_b32_e32 v11, s1 +; GFX90A-NEXT: v_mov_b32_e32 v12, s1 +; GFX90A-NEXT: v_mov_b32_e32 v13, s1 +; GFX90A-NEXT: v_mov_b32_e32 v14, s1 +; GFX90A-NEXT: v_mov_b32_e32 v15, s1 +; GFX90A-NEXT: v_mov_b32_e32 v16, s1 +; GFX90A-NEXT: v_mov_b32_e32 v17, s1 +; GFX90A-NEXT: v_mov_b32_e32 v18, s1 +; GFX90A-NEXT: v_mov_b32_e32 v19, s1 +; GFX90A-NEXT: v_mov_b32_e32 v20, s1 +; GFX90A-NEXT: v_mov_b32_e32 v21, s1 +; GFX90A-NEXT: v_mov_b32_e32 v22, s1 +; GFX90A-NEXT: v_mov_b32_e32 v23, s1 +; GFX90A-NEXT: v_mov_b32_e32 v24, s1 +; GFX90A-NEXT: v_mov_b32_e32 v25, s1 +; GFX90A-NEXT: v_mov_b32_e32 v26, s1 +; GFX90A-NEXT: v_mov_b32_e32 v27, s1 +; GFX90A-NEXT: v_mov_b32_e32 v28, s1 +; GFX90A-NEXT: v_mov_b32_e32 v29, s1 +; GFX90A-NEXT: v_mov_b32_e32 v30, s1 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1443,53 +2075,117 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a16, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a18, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a19, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a20, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a21, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a22, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a23, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a24, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a25, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a26, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a27, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a28, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a29, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a30, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, s1 +; GFX942-NEXT: v_mov_b32_e32 v31, s1 +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NEXT: v_mov_b32_e32 v3, s1 +; GFX942-NEXT: v_mov_b32_e32 v4, s1 +; GFX942-NEXT: v_mov_b32_e32 v5, s1 +; GFX942-NEXT: v_mov_b32_e32 v6, s1 +; GFX942-NEXT: v_mov_b32_e32 v7, s1 +; GFX942-NEXT: v_mov_b32_e32 v8, s1 +; GFX942-NEXT: v_mov_b32_e32 v9, s1 +; GFX942-NEXT: v_mov_b32_e32 v10, s1 +; GFX942-NEXT: v_mov_b32_e32 v11, s1 +; GFX942-NEXT: v_mov_b32_e32 v12, s1 +; GFX942-NEXT: v_mov_b32_e32 v13, s1 +; GFX942-NEXT: v_mov_b32_e32 v14, s1 +; GFX942-NEXT: v_mov_b32_e32 v15, s1 +; GFX942-NEXT: v_mov_b32_e32 v16, s1 +; GFX942-NEXT: v_mov_b32_e32 v17, s1 +; GFX942-NEXT: v_mov_b32_e32 v18, s1 +; GFX942-NEXT: v_mov_b32_e32 v19, s1 +; GFX942-NEXT: v_mov_b32_e32 v20, s1 +; GFX942-NEXT: v_mov_b32_e32 v21, s1 +; GFX942-NEXT: v_mov_b32_e32 v22, s1 +; GFX942-NEXT: v_mov_b32_e32 v23, s1 +; GFX942-NEXT: v_mov_b32_e32 v24, s1 +; GFX942-NEXT: v_mov_b32_e32 v25, s1 +; GFX942-NEXT: v_mov_b32_e32 v26, s1 +; GFX942-NEXT: v_mov_b32_e32 v27, s1 +; GFX942-NEXT: v_mov_b32_e32 v28, s1 +; GFX942-NEXT: v_mov_b32_e32 v29, s1 +; GFX942-NEXT: v_mov_b32_e32 v30, s1 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1646,56 +2342,118 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1709,56 +2467,118 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2072,73 +2892,138 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] -; GFX908-NEXT: s_endpgm -; -; GFX90A-LABEL: test_mfma_loop_agpr_init: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 -; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: test_mfma_loop_agpr_init: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v33, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a0 ; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v31 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v32 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v33 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v33, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2158,49 +3043,114 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v33, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v32, a0 ; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v31 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v32 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v33 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v32, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v33, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2615,51 +3565,114 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s4, s4, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2697,51 +3710,114 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: s_nop 14 ; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2875,51 +3951,114 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s4, s4, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2957,51 +4096,114 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: s_nop 14 ; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index 2462414992e36..12efca7dcadb5 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 098a60dd61a1c..1156f2718cf1e 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -722,8 +722,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x2800, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 ; GFX90A-NEXT: s_movk_i32 s2, 0xf000 ; GFX90A-NEXT: s_movk_i32 s3, 0x1000 ; GFX90A-NEXT: s_movk_i32 s4, 0x2000 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index f196004e7660b..6b0ede1ac3ab8 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -113,16 +113,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6 ; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28 +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 -; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29 +; GLOBALNESS1-NEXT: s_branch .LBB1_4 +; GLOBALNESS1-NEXT: .LBB1_3: ; %bb73.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 @@ -171,10 +171,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_3 +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45] @@ -183,7 +181,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_25 +; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off @@ -212,7 +210,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[68:69] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 @@ -273,7 +271,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_14 -; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23 +; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59 +; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_4 +; GLOBALNESS1-NEXT: s_branch .LBB1_29 +; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 @@ -283,25 +285,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 ; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11 -; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24 +; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 -; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i +; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4 ; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 -; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i +; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 -; GLOBALNESS1-NEXT: .LBB1_28: ; %bb73.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off -; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_29: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 @@ -424,16 +422,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6 ; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28 +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 -; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29 +; GLOBALNESS0-NEXT: s_branch .LBB1_4 +; GLOBALNESS0-NEXT: .LBB1_3: ; %bb73.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 @@ -482,10 +480,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_3 +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45] @@ -494,7 +490,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_25 +; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off @@ -524,7 +520,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[68:69] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 @@ -585,7 +581,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_14 -; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23 +; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59 +; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_4 +; GLOBALNESS0-NEXT: s_branch .LBB1_29 +; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 @@ -593,25 +593,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9 ; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11 -; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24 +; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 -; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i +; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4 ; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 -; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i +; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 -; GLOBALNESS0-NEXT: .LBB1_28: ; %bb73.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off -; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_29: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 From fb45d3b0952b2645e2b1f2de51f14bb956165748 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 21 Nov 2025 14:07:24 +0000 Subject: [PATCH 3/3] clang-format --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 3e4b25dd2f663..39a6a7762eea5 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -857,7 +857,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { } if (TRI->hasVectorRegisters(MRI->getRegClass(PHIRes)) || - RC0 == &AMDGPU::VReg_1RegClass) { + RC0 == &AMDGPU::VReg_1RegClass) { LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); TII->legalizeOperands(MI, MDT); }